1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
16 #include <linux/frame.h>
17 #include <linux/highmem.h>
18 #include <linux/hrtimer.h>
19 #include <linux/kernel.h>
20 #include <linux/kvm_host.h>
21 #include <linux/module.h>
22 #include <linux/moduleparam.h>
23 #include <linux/mod_devicetable.h>
25 #include <linux/sched.h>
26 #include <linux/sched/smt.h>
27 #include <linux/slab.h>
28 #include <linux/tboot.h>
29 #include <linux/trace_events.h>
30 #include <linux/entry-kvm.h>
35 #include <asm/cpu_device_id.h>
36 #include <asm/debugreg.h>
38 #include <asm/fpu/internal.h>
40 #include <asm/irq_remapping.h>
41 #include <asm/kexec.h>
42 #include <asm/perf_event.h>
44 #include <asm/mmu_context.h>
45 #include <asm/mshyperv.h>
46 #include <asm/mwait.h>
47 #include <asm/spec-ctrl.h>
48 #include <asm/virtext.h>
51 #include "capabilities.h"
55 #include "kvm_cache_regs.h"
67 MODULE_AUTHOR("Qumranet");
68 MODULE_LICENSE("GPL");
71 static const struct x86_cpu_id vmx_cpu_id[] = {
72 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
75 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
78 bool __read_mostly enable_vpid = 1;
79 module_param_named(vpid, enable_vpid, bool, 0444);
81 static bool __read_mostly enable_vnmi = 1;
82 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
84 bool __read_mostly flexpriority_enabled = 1;
85 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
87 bool __read_mostly enable_ept = 1;
88 module_param_named(ept, enable_ept, bool, S_IRUGO);
90 bool __read_mostly enable_unrestricted_guest = 1;
91 module_param_named(unrestricted_guest,
92 enable_unrestricted_guest, bool, S_IRUGO);
94 bool __read_mostly enable_ept_ad_bits = 1;
95 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
97 static bool __read_mostly emulate_invalid_guest_state = true;
98 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
100 static bool __read_mostly fasteoi = 1;
101 module_param(fasteoi, bool, S_IRUGO);
103 bool __read_mostly enable_apicv = 1;
104 module_param(enable_apicv, bool, S_IRUGO);
107 * If nested=1, nested virtualization is supported, i.e., guests may use
108 * VMX and act as hypervisors for their own guests. If nested=0, guests may
109 * not use VMX instructions.
111 static bool __read_mostly nested = 1;
112 module_param(nested, bool, S_IRUGO);
114 bool __read_mostly enable_pml = 1;
115 module_param_named(pml, enable_pml, bool, S_IRUGO);
117 static bool __read_mostly dump_invalid_vmcs = 0;
118 module_param(dump_invalid_vmcs, bool, 0644);
120 #define MSR_BITMAP_MODE_X2APIC 1
121 #define MSR_BITMAP_MODE_X2APIC_APICV 2
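/*
 * These are bit flags, presumably OR'd into a per-vCPU MSR-bitmap-mode
 * field elsewhere in this file, describing which x2APIC intercept layout
 * the currently built MSR bitmap reflects (plain x2APIC vs. x2APIC with
 * APIC virtualization).
 */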
123 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
125 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
126 static int __read_mostly cpu_preemption_timer_multi;
127 static bool __read_mostly enable_preemption_timer = 1;
129 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
132 extern bool __read_mostly allow_smaller_maxphyaddr;
133 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
135 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
136 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
137 #define KVM_VM_CR0_ALWAYS_ON \
138 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
139 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
141 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
142 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
143 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
145 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
147 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
148 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
149 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
150 RTIT_STATUS_BYTECNT))
152 #define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
153 (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
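/*
 * Reading the macro above: the reserved bits of IA32_RTIT_OUTPUT_BASE are
 * everything at or above the vCPU's MAXPHYADDR plus the low 7 bits, i.e.
 * the output base must be a 128-byte-aligned, valid guest physical address.
 */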
156 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
157 * ple_gap: upper bound on the amount of time between two successive
158 * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
159 * According to testing, this time is usually smaller than 128 cycles.
160 * ple_window: upper bound on the amount of time a guest is allowed to execute
161 * in a PAUSE loop. Tests indicate that most spinlocks are held for
162 * less than 2^12 cycles.
163 * Time is measured based on a counter that runs at the same rate as the TSC;
164 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
166 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
167 module_param(ple_gap, uint, 0444);
169 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
170 module_param(ple_window, uint, 0444);
172 /* Default doubles per-vcpu window every exit. */
173 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
174 module_param(ple_window_grow, uint, 0444);
176 /* Default resets per-vcpu window every exit to ple_window. */
177 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
178 module_param(ple_window_shrink, uint, 0444);
180 /* Default is to compute the maximum so we can never overflow. */
181 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
182 module_param(ple_window_max, uint, 0444);
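/*
 * Illustrative sketch (not the in-tree helpers themselves): on a grow
 * event the per-vCPU window becomes roughly
 *
 *	min(old_window * ple_window_grow, ple_window_max)
 *
 * and on a shrink event it is divided by ple_window_shrink, or reset to
 * ple_window when the shrink modifier is 0 (the default).
 */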
184 /* Default is SYSTEM mode, 1 for host-guest mode */
185 int __read_mostly pt_mode = PT_MODE_SYSTEM;
186 module_param(pt_mode, int, S_IRUGO);
188 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
189 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
190 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
192 /* Storage for pre module init parameter parsing */
193 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
195 static const struct {
198 } vmentry_l1d_param[] = {
199 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
200 [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
201 [VMENTER_L1D_FLUSH_COND] = {"cond", true},
202 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
203 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
204 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
207 #define L1D_CACHE_ORDER 4
208 static void *vmx_l1d_flush_pages;
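/*
 * An order-4 allocation is 16 pages, i.e. 64 KiB with 4 KiB pages. This
 * buffer is read on the software-fallback path to displace the contents
 * of the L1D cache when X86_FEATURE_FLUSH_L1D is not available.
 */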
210 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
215 if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
216 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
221 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
225 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
228 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
229 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
230 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
235 /* If set to auto, use the default L1TF mitigation method. */
236 if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
237 switch (l1tf_mitigation) {
238 case L1TF_MITIGATION_OFF:
239 l1tf = VMENTER_L1D_FLUSH_NEVER;
241 case L1TF_MITIGATION_FLUSH_NOWARN:
242 case L1TF_MITIGATION_FLUSH:
243 case L1TF_MITIGATION_FLUSH_NOSMT:
244 l1tf = VMENTER_L1D_FLUSH_COND;
246 case L1TF_MITIGATION_FULL:
247 case L1TF_MITIGATION_FULL_FORCE:
248 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
251 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
252 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
255 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
256 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
258 * This allocation for vmx_l1d_flush_pages is not tied to a VM
259 * lifetime and so should not be charged to a memcg.
261 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
264 vmx_l1d_flush_pages = page_address(page);
267 * Initialize each page with a different pattern in
268 * order to protect against KSM in the nested
269 * virtualization case.
271 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
272 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
277 l1tf_vmx_mitigation = l1tf;
279 if (l1tf != VMENTER_L1D_FLUSH_NEVER)
280 static_branch_enable(&vmx_l1d_should_flush);
282 static_branch_disable(&vmx_l1d_should_flush);
284 if (l1tf == VMENTER_L1D_FLUSH_COND)
285 static_branch_enable(&vmx_l1d_flush_cond);
287 static_branch_disable(&vmx_l1d_flush_cond);
291 static int vmentry_l1d_flush_parse(const char *s)
296 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
297 if (vmentry_l1d_param[i].for_parse &&
298 sysfs_streq(s, vmentry_l1d_param[i].option))
305 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
309 l1tf = vmentry_l1d_flush_parse(s);
313 if (!boot_cpu_has(X86_BUG_L1TF))
317 * Has vmx_init() run already? If not, then this is the pre-init
318 * parameter parsing. In that case just store the value and let
319 * vmx_init() do the proper setup after enable_ept has been
322 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
323 vmentry_l1d_flush_param = l1tf;
327 mutex_lock(&vmx_l1d_flush_mutex);
328 ret = vmx_setup_l1d_flush(l1tf);
329 mutex_unlock(&vmx_l1d_flush_mutex);
333 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
335 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
336 return sprintf(s, "???\n");
338 return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
341 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
342 .set = vmentry_l1d_flush_set,
343 .get = vmentry_l1d_flush_get,
345 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
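/*
 * Usage note (illustrative): with the callbacks above, the flush mode can
 * be chosen at load time, e.g. "kvm-intel.vmentry_l1d_flush=cond" on the
 * kernel command line, or changed later through
 * /sys/module/kvm_intel/parameters/vmentry_l1d_flush (mode 0644 above).
 */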
347 static bool guest_state_valid(struct kvm_vcpu *vcpu);
348 static u32 vmx_segment_access_rights(struct kvm_segment *var);
349 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
352 void vmx_vmexit(void);
354 #define vmx_insn_failed(fmt...) \
357 pr_warn_ratelimited(fmt); \
360 asmlinkage void vmread_error(unsigned long field, bool fault)
363 kvm_spurious_fault();
365 vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
368 noinline void vmwrite_error(unsigned long field, unsigned long value)
370 vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
371 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
374 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
376 vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
379 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
381 vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
384 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
386 vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
390 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
392 vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
396 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
397 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
399 * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is
400 * needed when a CPU is brought down and we need to VMCLEAR all VMCSs loaded on it.
402 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
405 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
406 * can find which vCPU should be woken up.
408 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
409 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
411 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
412 static DEFINE_SPINLOCK(vmx_vpid_lock);
414 struct vmcs_config vmcs_config;
415 struct vmx_capability vmx_capability;
417 #define VMX_SEGMENT_FIELD(seg) \
418 [VCPU_SREG_##seg] = { \
419 .selector = GUEST_##seg##_SELECTOR, \
420 .base = GUEST_##seg##_BASE, \
421 .limit = GUEST_##seg##_LIMIT, \
422 .ar_bytes = GUEST_##seg##_AR_BYTES, \
425 static const struct kvm_vmx_segment_field {
430 } kvm_vmx_segment_fields[] = {
431 VMX_SEGMENT_FIELD(CS),
432 VMX_SEGMENT_FIELD(DS),
433 VMX_SEGMENT_FIELD(ES),
434 VMX_SEGMENT_FIELD(FS),
435 VMX_SEGMENT_FIELD(GS),
436 VMX_SEGMENT_FIELD(SS),
437 VMX_SEGMENT_FIELD(TR),
438 VMX_SEGMENT_FIELD(LDTR),
441 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
443 vmx->segment_cache.bitmask = 0;
446 static unsigned long host_idt_base;
449 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
450 * will emulate SYSCALL in legacy mode if the vendor string in guest
451 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
452 * support this emulation, IA32_STAR must always be included in
453 * vmx_msr_index[], even in i386 builds.
455 const u32 vmx_msr_index[] = {
457 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
459 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
463 #if IS_ENABLED(CONFIG_HYPERV)
464 static bool __read_mostly enlightened_vmcs = true;
465 module_param(enlightened_vmcs, bool, 0444);
467 /* check_ept_pointer_match() must be called under the protection of ept_pointer_lock. */
468 static void check_ept_pointer_match(struct kvm *kvm)
470 struct kvm_vcpu *vcpu;
471 u64 tmp_eptp = INVALID_PAGE;
474 kvm_for_each_vcpu(i, vcpu, kvm) {
475 if (!VALID_PAGE(tmp_eptp)) {
476 tmp_eptp = to_vmx(vcpu)->ept_pointer;
477 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
478 to_kvm_vmx(kvm)->ept_pointers_match
479 = EPT_POINTERS_MISMATCH;
484 to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
487 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
490 struct kvm_tlb_range *range = data;
492 return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
496 static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
497 struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
499 u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
502 * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address
503 * of the base of the EPT PML4 table; strip off the EPT configuration
507 return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
508 kvm_fill_hv_flush_list_func, (void *)range);
510 return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
513 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
514 struct kvm_tlb_range *range)
516 struct kvm_vcpu *vcpu;
519 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
521 if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
522 check_ept_pointer_match(kvm);
524 if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
525 kvm_for_each_vcpu(i, vcpu, kvm) {
526 /* If ept_pointer is an invalid pointer, bypass the flush request. */
527 if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
528 ret |= __hv_remote_flush_tlb_with_range(
532 ret = __hv_remote_flush_tlb_with_range(kvm,
533 kvm_get_vcpu(kvm, 0), range);
536 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
539 static int hv_remote_flush_tlb(struct kvm *kvm)
541 return hv_remote_flush_tlb_with_range(kvm, NULL);
544 static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
546 struct hv_enlightened_vmcs *evmcs;
547 struct hv_partition_assist_pg **p_hv_pa_pg =
548 &vcpu->kvm->arch.hyperv.hv_pa_pg;
550 * Synthetic VM-Exit is not enabled in the current code, so all
551 * eVMCSes in a single VM share the same assist page.
554 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
559 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
561 evmcs->partition_assist_page =
563 evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
564 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
569 #endif /* IS_ENABLED(CONFIG_HYPERV) */
572 * Comment's format: document - errata name - stepping - processor name.
574 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
576 static u32 vmx_preemption_cpu_tfms[] = {
577 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
579 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */
580 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
581 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
583 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
585 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
586 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
588 * 320767.pdf - AAP86 - B1 -
589 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
592 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
594 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
596 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
598 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
599 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
600 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
602 /* Xeon E3-1220 V2 */
606 static inline bool cpu_has_broken_vmx_preemption_timer(void)
608 u32 eax = cpuid_eax(0x00000001), i;
610 /* Clear the reserved bits */
611 eax &= ~(0x3U << 14 | 0xfU << 28);
612 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
613 if (eax == vmx_preemption_cpu_tfms[i])
619 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
621 return flexpriority_enabled && lapic_in_kernel(vcpu);
624 static inline bool report_flexpriority(void)
626 return flexpriority_enabled;
629 static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
633 for (i = 0; i < vmx->nmsrs; ++i)
634 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
639 struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
643 i = __find_msr_index(vmx, msr);
645 return &vmx->guest_msrs[i];
649 static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
653 u64 old_msr_data = msr->data;
655 if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
657 ret = kvm_set_shared_msr(msr->index, msr->data,
661 msr->data = old_msr_data;
666 #ifdef CONFIG_KEXEC_CORE
667 static void crash_vmclear_local_loaded_vmcss(void)
669 int cpu = raw_smp_processor_id();
670 struct loaded_vmcs *v;
672 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
673 loaded_vmcss_on_cpu_link)
676 #endif /* CONFIG_KEXEC_CORE */
678 static void __loaded_vmcs_clear(void *arg)
680 struct loaded_vmcs *loaded_vmcs = arg;
681 int cpu = raw_smp_processor_id();
683 if (loaded_vmcs->cpu != cpu)
684 return; /* vcpu migration can race with cpu offline */
685 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
686 per_cpu(current_vmcs, cpu) = NULL;
688 vmcs_clear(loaded_vmcs->vmcs);
689 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
690 vmcs_clear(loaded_vmcs->shadow_vmcs);
692 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
695 * Ensure all writes to loaded_vmcs, including deleting it from its
696 * current percpu list, complete before setting loaded_vmcs->cpu to
697 * -1, otherwise a different cpu can see cpu == -1 first and add
698 * loaded_vmcs to its percpu list before it's deleted from this cpu's
699 * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
703 loaded_vmcs->cpu = -1;
704 loaded_vmcs->launched = 0;
707 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
709 int cpu = loaded_vmcs->cpu;
712 smp_call_function_single(cpu,
713 __loaded_vmcs_clear, loaded_vmcs, 1);
716 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
720 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
722 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
723 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
724 vmx->segment_cache.bitmask = 0;
726 ret = vmx->segment_cache.bitmask & mask;
727 vmx->segment_cache.bitmask |= mask;
731 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
733 u16 *p = &vmx->segment_cache.seg[seg].selector;
735 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
736 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
740 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
742 ulong *p = &vmx->segment_cache.seg[seg].base;
744 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
745 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
749 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
751 u32 *p = &vmx->segment_cache.seg[seg].limit;
753 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
754 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
758 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
760 u32 *p = &vmx->segment_cache.seg[seg].ar;
762 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
763 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
767 void update_exception_bitmap(struct kvm_vcpu *vcpu)
771 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
772 (1u << DB_VECTOR) | (1u << AC_VECTOR);
774 * Guest access to VMware backdoor ports could legitimately
775 * trigger #GP because of TSS I/O permission bitmap.
776 * We intercept those #GP and allow access to them anyway
779 if (enable_vmware_backdoor)
780 eb |= (1u << GP_VECTOR);
781 if ((vcpu->guest_debug &
782 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
783 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
784 eb |= 1u << BP_VECTOR;
785 if (to_vmx(vcpu)->rmode.vm86_active)
787 if (!vmx_need_pf_intercept(vcpu))
788 eb &= ~(1u << PF_VECTOR);
790 /* When we are running a nested L2 guest and L1 specified for it a
791 * certain exception bitmap, we must trap the same exceptions and pass
792 * them to L1. When running L2, we will only handle the exceptions
793 * specified above if L1 did not want them.
795 if (is_guest_mode(vcpu))
796 eb |= get_vmcs12(vcpu)->exception_bitmap;
799 * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
800 * between guest and host. In that case we only care about present
801 * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
802 * prepare_vmcs02_rare.
804 bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR));
805 int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0;
806 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
807 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask);
810 vmcs_write32(EXCEPTION_BITMAP, eb);
814 * Check if a write to the MSR is intercepted by the currently loaded MSR bitmap.
816 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
818 unsigned long *msr_bitmap;
819 int f = sizeof(unsigned long);
821 if (!cpu_has_vmx_msr_bitmap())
824 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
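/*
 * Bitmap layout assumed here (per the SDM's MSR-bitmap definition): the
 * write bitmap for low MSRs (0x00000000 - 0x00001fff) starts at offset
 * 0x800 and the write bitmap for high MSRs (0xc0000000 - 0xc0001fff) at
 * 0xc00; the read bitmaps occupy offsets 0x0 and 0x400.
 */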
827 return !!test_bit(msr, msr_bitmap + 0x800 / f);
828 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
830 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
836 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
837 unsigned long entry, unsigned long exit)
839 vm_entry_controls_clearbit(vmx, entry);
840 vm_exit_controls_clearbit(vmx, exit);
843 int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
847 for (i = 0; i < m->nr; ++i) {
848 if (m->val[i].index == msr)
854 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
857 struct msr_autoload *m = &vmx->msr_autoload;
861 if (cpu_has_load_ia32_efer()) {
862 clear_atomic_switch_msr_special(vmx,
863 VM_ENTRY_LOAD_IA32_EFER,
864 VM_EXIT_LOAD_IA32_EFER);
868 case MSR_CORE_PERF_GLOBAL_CTRL:
869 if (cpu_has_load_perf_global_ctrl()) {
870 clear_atomic_switch_msr_special(vmx,
871 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
872 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
877 i = vmx_find_msr_index(&m->guest, msr);
881 m->guest.val[i] = m->guest.val[m->guest.nr];
882 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
885 i = vmx_find_msr_index(&m->host, msr);
890 m->host.val[i] = m->host.val[m->host.nr];
891 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
894 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
895 unsigned long entry, unsigned long exit,
896 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
897 u64 guest_val, u64 host_val)
899 vmcs_write64(guest_val_vmcs, guest_val);
900 if (host_val_vmcs != HOST_IA32_EFER)
901 vmcs_write64(host_val_vmcs, host_val);
902 vm_entry_controls_setbit(vmx, entry);
903 vm_exit_controls_setbit(vmx, exit);
906 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
907 u64 guest_val, u64 host_val, bool entry_only)
910 struct msr_autoload *m = &vmx->msr_autoload;
914 if (cpu_has_load_ia32_efer()) {
915 add_atomic_switch_msr_special(vmx,
916 VM_ENTRY_LOAD_IA32_EFER,
917 VM_EXIT_LOAD_IA32_EFER,
920 guest_val, host_val);
924 case MSR_CORE_PERF_GLOBAL_CTRL:
925 if (cpu_has_load_perf_global_ctrl()) {
926 add_atomic_switch_msr_special(vmx,
927 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
928 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
929 GUEST_IA32_PERF_GLOBAL_CTRL,
930 HOST_IA32_PERF_GLOBAL_CTRL,
931 guest_val, host_val);
935 case MSR_IA32_PEBS_ENABLE:
936 /* PEBS needs a quiescent period after being disabled (to write
937 * a record). Disabling PEBS through VMX MSR swapping doesn't
938 * provide that period, so a CPU could write host's record into
941 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
944 i = vmx_find_msr_index(&m->guest, msr);
946 j = vmx_find_msr_index(&m->host, msr);
948 if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
949 (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
950 printk_once(KERN_WARNING "Not enough msr switch entries. "
951 "Can't add msr %x\n", msr);
956 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
958 m->guest.val[i].index = msr;
959 m->guest.val[i].value = guest_val;
966 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
968 m->host.val[j].index = msr;
969 m->host.val[j].value = host_val;
972 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
974 u64 guest_efer = vmx->vcpu.arch.efer;
977 /* Shadow paging assumes NX to be available. */
979 guest_efer |= EFER_NX;
982 * LMA and LME handled by hardware; SCE meaningless outside long mode.
984 ignore_bits |= EFER_SCE;
986 ignore_bits |= EFER_LMA | EFER_LME;
987 /* SCE is meaningful only in long mode on Intel */
988 if (guest_efer & EFER_LMA)
989 ignore_bits &= ~(u64)EFER_SCE;
993 * On EPT, we can't emulate NX, so we must switch EFER atomically.
994 * On CPUs that support "load IA32_EFER", always switch EFER
995 * atomically, since it's faster than switching it manually.
997 if (cpu_has_load_ia32_efer() ||
998 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
999 if (!(guest_efer & EFER_LMA))
1000 guest_efer &= ~EFER_LME;
1001 if (guest_efer != host_efer)
1002 add_atomic_switch_msr(vmx, MSR_EFER,
1003 guest_efer, host_efer, false);
1005 clear_atomic_switch_msr(vmx, MSR_EFER);
1008 clear_atomic_switch_msr(vmx, MSR_EFER);
1010 guest_efer &= ~ignore_bits;
1011 guest_efer |= host_efer & ignore_bits;
1013 vmx->guest_msrs[efer_offset].data = guest_efer;
1014 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
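/*
 * Note (assumption about the shared-MSR machinery, kvm_set_shared_msr(),
 * which lives outside this file): the .mask selects which bits are taken
 * from .data when the MSR is actually written, so bits in ignore_bits end
 * up keeping the host's EFER values.
 */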
1020 #ifdef CONFIG_X86_32
1022 * On 32-bit kernels, VM exits still load the FS and GS bases from the
1023 * VMCS rather than the segment table. KVM uses this helper to figure
1024 * out the current bases to poke them into the VMCS before entry.
1026 static unsigned long segment_base(u16 selector)
1028 struct desc_struct *table;
1031 if (!(selector & ~SEGMENT_RPL_MASK))
1034 table = get_current_gdt_ro();
1036 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1037 u16 ldt_selector = kvm_read_ldt();
1039 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1042 table = (struct desc_struct *)segment_base(ldt_selector);
1044 v = get_desc_base(&table[selector >> 3]);
1049 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1051 return vmx_pt_mode_is_host_guest() &&
1052 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1055 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1059 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1060 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1061 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1062 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1063 for (i = 0; i < addr_range; i++) {
1064 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1065 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1069 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1073 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1074 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1075 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1076 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1077 for (i = 0; i < addr_range; i++) {
1078 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1079 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1083 static void pt_guest_enter(struct vcpu_vmx *vmx)
1085 if (vmx_pt_mode_is_system())
1089 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1090 * Save host state before VM entry.
1092 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1093 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1094 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1095 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1096 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1100 static void pt_guest_exit(struct vcpu_vmx *vmx)
1102 if (vmx_pt_mode_is_system())
1105 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1106 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1107 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1110 /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1111 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1114 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1115 unsigned long fs_base, unsigned long gs_base)
1117 if (unlikely(fs_sel != host->fs_sel)) {
1119 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1121 vmcs_write16(HOST_FS_SELECTOR, 0);
1122 host->fs_sel = fs_sel;
1124 if (unlikely(gs_sel != host->gs_sel)) {
1126 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1128 vmcs_write16(HOST_GS_SELECTOR, 0);
1129 host->gs_sel = gs_sel;
1131 if (unlikely(fs_base != host->fs_base)) {
1132 vmcs_writel(HOST_FS_BASE, fs_base);
1133 host->fs_base = fs_base;
1135 if (unlikely(gs_base != host->gs_base)) {
1136 vmcs_writel(HOST_GS_BASE, gs_base);
1137 host->gs_base = gs_base;
1141 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1143 struct vcpu_vmx *vmx = to_vmx(vcpu);
1144 struct vmcs_host_state *host_state;
1145 #ifdef CONFIG_X86_64
1146 int cpu = raw_smp_processor_id();
1148 unsigned long fs_base, gs_base;
1152 vmx->req_immediate_exit = false;
1155 * Note that guest MSRs to be saved/restored can also be changed
1156 * when guest state is loaded. This happens when the guest transitions
1157 * to/from long mode by setting MSR_EFER.LMA.
1159 if (!vmx->guest_msrs_ready) {
1160 vmx->guest_msrs_ready = true;
1161 for (i = 0; i < vmx->save_nmsrs; ++i)
1162 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1163 vmx->guest_msrs[i].data,
1164 vmx->guest_msrs[i].mask);
1168 if (vmx->nested.need_vmcs12_to_shadow_sync)
1169 nested_sync_vmcs12_to_shadow(vcpu);
1171 if (vmx->guest_state_loaded)
1174 host_state = &vmx->loaded_vmcs->host_state;
1177 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1178 * allow segment selectors with cpl > 0 or ti == 1.
1180 host_state->ldt_sel = kvm_read_ldt();
1182 #ifdef CONFIG_X86_64
1183 savesegment(ds, host_state->ds_sel);
1184 savesegment(es, host_state->es_sel);
1186 gs_base = cpu_kernelmode_gs_base(cpu);
1187 if (likely(is_64bit_mm(current->mm))) {
1188 current_save_fsgs();
1189 fs_sel = current->thread.fsindex;
1190 gs_sel = current->thread.gsindex;
1191 fs_base = current->thread.fsbase;
1192 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1194 savesegment(fs, fs_sel);
1195 savesegment(gs, gs_sel);
1196 fs_base = read_msr(MSR_FS_BASE);
1197 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1200 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1202 savesegment(fs, fs_sel);
1203 savesegment(gs, gs_sel);
1204 fs_base = segment_base(fs_sel);
1205 gs_base = segment_base(gs_sel);
1208 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1209 vmx->guest_state_loaded = true;
1212 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1214 struct vmcs_host_state *host_state;
1216 if (!vmx->guest_state_loaded)
1219 host_state = &vmx->loaded_vmcs->host_state;
1221 ++vmx->vcpu.stat.host_state_reload;
1223 #ifdef CONFIG_X86_64
1224 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1226 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1227 kvm_load_ldt(host_state->ldt_sel);
1228 #ifdef CONFIG_X86_64
1229 load_gs_index(host_state->gs_sel);
1231 loadsegment(gs, host_state->gs_sel);
1234 if (host_state->fs_sel & 7)
1235 loadsegment(fs, host_state->fs_sel);
1236 #ifdef CONFIG_X86_64
1237 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1238 loadsegment(ds, host_state->ds_sel);
1239 loadsegment(es, host_state->es_sel);
1242 invalidate_tss_limit();
1243 #ifdef CONFIG_X86_64
1244 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1246 load_fixmap_gdt(raw_smp_processor_id());
1247 vmx->guest_state_loaded = false;
1248 vmx->guest_msrs_ready = false;
1251 #ifdef CONFIG_X86_64
1252 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1255 if (vmx->guest_state_loaded)
1256 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1258 return vmx->msr_guest_kernel_gs_base;
1261 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1264 if (vmx->guest_state_loaded)
1265 wrmsrl(MSR_KERNEL_GS_BASE, data);
1267 vmx->msr_guest_kernel_gs_base = data;
1271 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1273 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1274 struct pi_desc old, new;
1278 * In case of hot-plug or hot-unplug, we may have to undo
1279 * vmx_vcpu_pi_put even if there is no assigned device. And we
1280 * always keep PI.NDST up to date for simplicity: it makes the
1281 * code easier, and CPU migration is not a fast path.
1283 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
1287 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
1288 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
1289 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
1290 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
1293 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
1294 pi_clear_sn(pi_desc);
1295 goto after_clear_sn;
1298 /* The full case. */
1300 old.control = new.control = pi_desc->control;
1302 dest = cpu_physical_id(cpu);
1304 if (x2apic_enabled())
1307 new.ndst = (dest << 8) & 0xFF00;
1310 } while (cmpxchg64(&pi_desc->control, old.control,
1311 new.control) != old.control);
1316 * Clear SN before reading the bitmap. The VT-d firmware
1317 * writes the bitmap and reads SN atomically (5.2.3 in the
1318 * spec), so it doesn't really have a memory barrier that
1319 * pairs with this, but we cannot do that and we need one.
1321 smp_mb__after_atomic();
1323 if (!pi_is_pir_empty(pi_desc))
1327 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
1328 struct loaded_vmcs *buddy)
1330 struct vcpu_vmx *vmx = to_vmx(vcpu);
1331 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1334 if (!already_loaded) {
1335 loaded_vmcs_clear(vmx->loaded_vmcs);
1336 local_irq_disable();
1339 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1340 * this cpu's percpu list, otherwise it may not yet be deleted
1341 * from its previous cpu's percpu list. Pairs with the
1342 * smp_wmb() in __loaded_vmcs_clear().
1346 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1347 &per_cpu(loaded_vmcss_on_cpu, cpu));
1351 prev = per_cpu(current_vmcs, cpu);
1352 if (prev != vmx->loaded_vmcs->vmcs) {
1353 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1354 vmcs_load(vmx->loaded_vmcs->vmcs);
1357 * No indirect branch prediction barrier needed when switching
1358 * the active VMCS within a guest, e.g. on nested VM-Enter.
1359 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
1361 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
1362 indirect_branch_prediction_barrier();
1365 if (!already_loaded) {
1366 void *gdt = get_current_gdt_ro();
1367 unsigned long sysenter_esp;
1370 * Flush all EPTP/VPID contexts, the new pCPU may have stale
1371 * TLB entries from its previous association with the vCPU.
1373 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1376 * Linux uses per-cpu TSS and GDT, so set these when switching
1377 * processors. See 22.2.4.
1379 vmcs_writel(HOST_TR_BASE,
1380 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1381 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
1383 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1384 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1386 vmx->loaded_vmcs->cpu = cpu;
1389 /* Setup TSC multiplier */
1390 if (kvm_has_tsc_control &&
1391 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
1392 decache_tsc_multiplier(vmx);
1396 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1397 * vcpu mutex is already taken.
1399 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1401 struct vcpu_vmx *vmx = to_vmx(vcpu);
1403 vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1405 vmx_vcpu_pi_load(vcpu, cpu);
1407 vmx->host_debugctlmsr = get_debugctlmsr();
1410 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
1412 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1414 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
1415 !irq_remapping_cap(IRQ_POSTING_CAP) ||
1416 !kvm_vcpu_apicv_active(vcpu))
1419 /* Set SN when the vCPU is preempted */
1420 if (vcpu->preempted)
1424 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1426 vmx_vcpu_pi_put(vcpu);
1428 vmx_prepare_switch_to_host(to_vmx(vcpu));
1431 static bool emulation_required(struct kvm_vcpu *vcpu)
1433 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
1436 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1438 struct vcpu_vmx *vmx = to_vmx(vcpu);
1439 unsigned long rflags, save_rflags;
1441 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1442 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1443 rflags = vmcs_readl(GUEST_RFLAGS);
1444 if (vmx->rmode.vm86_active) {
1445 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1446 save_rflags = vmx->rmode.save_rflags;
1447 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1449 vmx->rflags = rflags;
1454 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1456 struct vcpu_vmx *vmx = to_vmx(vcpu);
1457 unsigned long old_rflags;
1459 if (enable_unrestricted_guest) {
1460 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1461 vmx->rflags = rflags;
1462 vmcs_writel(GUEST_RFLAGS, rflags);
1466 old_rflags = vmx_get_rflags(vcpu);
1467 vmx->rflags = rflags;
1468 if (vmx->rmode.vm86_active) {
1469 vmx->rmode.save_rflags = rflags;
1470 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1472 vmcs_writel(GUEST_RFLAGS, rflags);
1474 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1475 vmx->emulation_required = emulation_required(vcpu);
1478 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1480 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1483 if (interruptibility & GUEST_INTR_STATE_STI)
1484 ret |= KVM_X86_SHADOW_INT_STI;
1485 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1486 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1491 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1493 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1494 u32 interruptibility = interruptibility_old;
1496 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1498 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1499 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1500 else if (mask & KVM_X86_SHADOW_INT_STI)
1501 interruptibility |= GUEST_INTR_STATE_STI;
1503 if ((interruptibility != interruptibility_old))
1504 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1507 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1509 struct vcpu_vmx *vmx = to_vmx(vcpu);
1510 unsigned long value;
1513 * Any MSR write that attempts to change bits marked reserved will
1516 if (data & vmx->pt_desc.ctl_bitmask)
1520 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1521 * result in a #GP unless the same write also clears TraceEn.
1523 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1524 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1528 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
1529 * and FabricEn would cause a #GP, if
1530 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
1532 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1533 !(data & RTIT_CTL_FABRIC_EN) &&
1534 !intel_pt_validate_cap(vmx->pt_desc.caps,
1535 PT_CAP_single_range_output))
1539 * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
1540 * utilizes an encoding marked reserved will cause a #GP fault.
1542 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1543 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1544 !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1545 RTIT_CTL_MTC_RANGE_OFFSET, &value))
1547 value = intel_pt_validate_cap(vmx->pt_desc.caps,
1548 PT_CAP_cycle_thresholds);
1549 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1550 !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1551 RTIT_CTL_CYC_THRESH_OFFSET, &value))
1553 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1554 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1555 !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1556 RTIT_CTL_PSB_FREQ_OFFSET, &value))
1560 * If an ADDRx_CFG encoding is reserved or greater than 2, the
1561 * write will cause a #GP fault.
1563 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1564 if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
1566 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1567 if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
1569 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1570 if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
1572 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1573 if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
1579 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1581 unsigned long rip, orig_rip;
1584 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1585 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1586 * set when EPT misconfig occurs. In practice, real hardware updates
1587 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1588 * (namely Hyper-V) don't set it due to it being undefined behavior,
1589 * i.e. we end up advancing IP with some random value.
1591 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1592 to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
1593 orig_rip = kvm_rip_read(vcpu);
1594 rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1595 #ifdef CONFIG_X86_64
1597 * We need to mask out the high 32 bits of RIP if not in 64-bit
1598 * mode, but just finding out that we are in 64-bit mode is
1599 * quite expensive. Only do it if there was a carry.
1601 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1604 kvm_rip_write(vcpu, rip);
1606 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1610 /* skipping an emulated instruction also counts */
1611 vmx_set_interrupt_shadow(vcpu, 0);
1617 * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
1618 * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
1619 * indicates whether exit to userspace is needed.
1621 int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
1622 struct x86_exception *e)
1624 if (r == X86EMUL_PROPAGATE_FAULT) {
1625 kvm_inject_emulated_page_fault(vcpu, e);
1630 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
1631 * while handling a VMX instruction, KVM could've handled the request
1632 * correctly by exiting to userspace and performing I/O, but there
1633 * doesn't seem to be a real use case behind such requests; just return
1634 * KVM_EXIT_INTERNAL_ERROR for now.
1636 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
1637 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
1638 vcpu->run->internal.ndata = 0;
1644 * Recognizes a pending MTF VM-exit and records the nested state for later
1647 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1649 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1650 struct vcpu_vmx *vmx = to_vmx(vcpu);
1652 if (!is_guest_mode(vcpu))
1656 * Per the SDM, MTF takes priority over debug-trap exceptions besides
1657 * T-bit traps. As instruction emulation is completed (i.e. at the
1658 * instruction boundary), any #DB exception pending delivery must be a
1659 * debug-trap. Record the pending MTF state to be delivered in
1660 * vmx_check_nested_events().
1662 if (nested_cpu_has_mtf(vmcs12) &&
1663 (!vcpu->arch.exception.pending ||
1664 vcpu->arch.exception.nr == DB_VECTOR))
1665 vmx->nested.mtf_pending = true;
1667 vmx->nested.mtf_pending = false;
1670 static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1672 vmx_update_emulated_instruction(vcpu);
1673 return skip_emulated_instruction(vcpu);
1676 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1679 * Ensure that we clear the HLT state in the VMCS. We don't need to
1680 * explicitly skip the instruction because if the HLT state is set,
1681 * then the instruction is already executing and RIP has already been
1684 if (kvm_hlt_in_guest(vcpu->kvm) &&
1685 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1686 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1689 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
1691 struct vcpu_vmx *vmx = to_vmx(vcpu);
1692 unsigned nr = vcpu->arch.exception.nr;
1693 bool has_error_code = vcpu->arch.exception.has_error_code;
1694 u32 error_code = vcpu->arch.exception.error_code;
1695 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1697 kvm_deliver_exception_payload(vcpu);
1699 if (has_error_code) {
1700 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1701 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1704 if (vmx->rmode.vm86_active) {
1706 if (kvm_exception_is_soft(nr))
1707 inc_eip = vcpu->arch.event_exit_inst_len;
1708 kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
1712 WARN_ON_ONCE(vmx->emulation_required);
1714 if (kvm_exception_is_soft(nr)) {
1715 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1716 vmx->vcpu.arch.event_exit_inst_len);
1717 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1719 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1721 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1723 vmx_clear_hlt(vcpu);
1727 * Swap MSR entry in host/guest MSR entry array.
1729 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1731 struct shared_msr_entry tmp;
1733 tmp = vmx->guest_msrs[to];
1734 vmx->guest_msrs[to] = vmx->guest_msrs[from];
1735 vmx->guest_msrs[from] = tmp;
1739 * Set up the vmcs to automatically save and restore system
1740 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
1741 * mode, as fiddling with msrs is very expensive.
1743 static void setup_msrs(struct vcpu_vmx *vmx)
1745 int save_nmsrs, index;
1748 #ifdef CONFIG_X86_64
1750 * The SYSCALL MSRs are only needed on long mode guests, and only
1751 * when EFER.SCE is set.
1753 if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
1754 index = __find_msr_index(vmx, MSR_STAR);
1756 move_msr_up(vmx, index, save_nmsrs++);
1757 index = __find_msr_index(vmx, MSR_LSTAR);
1759 move_msr_up(vmx, index, save_nmsrs++);
1760 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1762 move_msr_up(vmx, index, save_nmsrs++);
1765 index = __find_msr_index(vmx, MSR_EFER);
1766 if (index >= 0 && update_transition_efer(vmx, index))
1767 move_msr_up(vmx, index, save_nmsrs++);
1768 index = __find_msr_index(vmx, MSR_TSC_AUX);
1769 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
1770 move_msr_up(vmx, index, save_nmsrs++);
1771 index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
1773 move_msr_up(vmx, index, save_nmsrs++);
1775 vmx->save_nmsrs = save_nmsrs;
1776 vmx->guest_msrs_ready = false;
1778 if (cpu_has_vmx_msr_bitmap())
1779 vmx_update_msr_bitmap(&vmx->vcpu);
1782 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1784 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1785 u64 g_tsc_offset = 0;
1788 * We're here if L1 chose not to trap WRMSR to TSC. According
1789 * to the spec, this should set L1's TSC; the offset that L1
1790 * set for L2 remains unchanged, and still needs to be added
1791 * to the newly set TSC to get L2's TSC.
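 * Worked example (illustrative): if L1 uses offset O1 for its own TSC and
 * programs O2 for L2 in vmcs12, L2's TSC reads as host TSC + O1 + O2.
 * Writing "offset + g_tsc_offset" below therefore replaces O1 while
 * preserving O2.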
1793 if (is_guest_mode(vcpu) &&
1794 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
1795 g_tsc_offset = vmcs12->tsc_offset;
1797 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1798 vcpu->arch.tsc_offset - g_tsc_offset,
1800 vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
1801 return offset + g_tsc_offset;
1805 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1806 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1807 * all guests if the "nested" module option is off, and can also be disabled
1808 * for a single guest by disabling its VMX cpuid bit.
1810 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1812 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1815 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
1818 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
1820 return !(val & ~valid_bits);
1823 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1825 switch (msr->index) {
1826 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1829 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1830 case MSR_IA32_PERF_CAPABILITIES:
1831 msr->data = vmx_get_perf_capabilities();
1834 return KVM_MSR_RET_INVALID;
1839 * Reads an msr value (of 'msr_index') into 'pdata'.
1840 * Returns 0 on success, non-0 otherwise.
1841 * Assumes vcpu_load() was already called.
1843 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1845 struct vcpu_vmx *vmx = to_vmx(vcpu);
1846 struct shared_msr_entry *msr;
1849 switch (msr_info->index) {
1850 #ifdef CONFIG_X86_64
1852 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1855 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1857 case MSR_KERNEL_GS_BASE:
1858 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1862 return kvm_get_msr_common(vcpu, msr_info);
1863 case MSR_IA32_TSX_CTRL:
1864 if (!msr_info->host_initiated &&
1865 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1867 goto find_shared_msr;
1868 case MSR_IA32_UMWAIT_CONTROL:
1869 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1872 msr_info->data = vmx->msr_ia32_umwait_control;
1874 case MSR_IA32_SPEC_CTRL:
1875 if (!msr_info->host_initiated &&
1876 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1879 msr_info->data = to_vmx(vcpu)->spec_ctrl;
1881 case MSR_IA32_SYSENTER_CS:
1882 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
1884 case MSR_IA32_SYSENTER_EIP:
1885 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
1887 case MSR_IA32_SYSENTER_ESP:
1888 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
1890 case MSR_IA32_BNDCFGS:
1891 if (!kvm_mpx_supported() ||
1892 (!msr_info->host_initiated &&
1893 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1895 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
1897 case MSR_IA32_MCG_EXT_CTL:
1898 if (!msr_info->host_initiated &&
1899 !(vmx->msr_ia32_feature_control &
1900 FEAT_CTL_LMCE_ENABLED))
1902 msr_info->data = vcpu->arch.mcg_ext_ctl;
1904 case MSR_IA32_FEAT_CTL:
1905 msr_info->data = vmx->msr_ia32_feature_control;
1907 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1908 if (!nested_vmx_allowed(vcpu))
1910 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
1914 * Enlightened VMCS v1 doesn't have certain fields, but buggy
1915 * Hyper-V versions are still trying to use corresponding
1916 * features when they are exposed. Filter out the essential
1919 if (!msr_info->host_initiated &&
1920 vmx->nested.enlightened_vmcs_enabled)
1921 nested_evmcs_filter_control_msr(msr_info->index,
1924 case MSR_IA32_RTIT_CTL:
1925 if (!vmx_pt_mode_is_host_guest())
1927 msr_info->data = vmx->pt_desc.guest.ctl;
1929 case MSR_IA32_RTIT_STATUS:
1930 if (!vmx_pt_mode_is_host_guest())
1932 msr_info->data = vmx->pt_desc.guest.status;
1934 case MSR_IA32_RTIT_CR3_MATCH:
1935 if (!vmx_pt_mode_is_host_guest() ||
1936 !intel_pt_validate_cap(vmx->pt_desc.caps,
1937 PT_CAP_cr3_filtering))
1939 msr_info->data = vmx->pt_desc.guest.cr3_match;
1941 case MSR_IA32_RTIT_OUTPUT_BASE:
1942 if (!vmx_pt_mode_is_host_guest() ||
1943 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1944 PT_CAP_topa_output) &&
1945 !intel_pt_validate_cap(vmx->pt_desc.caps,
1946 PT_CAP_single_range_output)))
1948 msr_info->data = vmx->pt_desc.guest.output_base;
1950 case MSR_IA32_RTIT_OUTPUT_MASK:
1951 if (!vmx_pt_mode_is_host_guest() ||
1952 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1953 PT_CAP_topa_output) &&
1954 !intel_pt_validate_cap(vmx->pt_desc.caps,
1955 PT_CAP_single_range_output)))
1957 msr_info->data = vmx->pt_desc.guest.output_mask;
1959 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1960 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
1961 if (!vmx_pt_mode_is_host_guest() ||
1962 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
1963 PT_CAP_num_address_ranges)))
1966 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
1968 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
1971 if (!msr_info->host_initiated &&
1972 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1974 goto find_shared_msr;
1977 msr = find_msr_entry(vmx, msr_info->index);
1979 msr_info->data = msr->data;
1982 return kvm_get_msr_common(vcpu, msr_info);
1988 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
1991 #ifdef CONFIG_X86_64
1992 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
1995 return (unsigned long)data;
1999 * Writes msr value into the appropriate "register".
2000 * Returns 0 on success, non-0 otherwise.
2001 * Assumes vcpu_load() was already called.
2003 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2005 struct vcpu_vmx *vmx = to_vmx(vcpu);
2006 struct shared_msr_entry *msr;
2008 u32 msr_index = msr_info->index;
2009 u64 data = msr_info->data;
2012 switch (msr_index) {
2014 ret = kvm_set_msr_common(vcpu, msr_info);
2016 #ifdef CONFIG_X86_64
2018 vmx_segment_cache_clear(vmx);
2019 vmcs_writel(GUEST_FS_BASE, data);
2022 vmx_segment_cache_clear(vmx);
2023 vmcs_writel(GUEST_GS_BASE, data);
2025 case MSR_KERNEL_GS_BASE:
2026 vmx_write_guest_kernel_gs_base(vmx, data);
2029 case MSR_IA32_SYSENTER_CS:
2030 if (is_guest_mode(vcpu))
2031 get_vmcs12(vcpu)->guest_sysenter_cs = data;
2032 vmcs_write32(GUEST_SYSENTER_CS, data);
2034 case MSR_IA32_SYSENTER_EIP:
2035 if (is_guest_mode(vcpu)) {
2036 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2037 get_vmcs12(vcpu)->guest_sysenter_eip = data;
2039 vmcs_writel(GUEST_SYSENTER_EIP, data);
2041 case MSR_IA32_SYSENTER_ESP:
2042 if (is_guest_mode(vcpu)) {
2043 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2044 get_vmcs12(vcpu)->guest_sysenter_esp = data;
2046 vmcs_writel(GUEST_SYSENTER_ESP, data);
2048 case MSR_IA32_DEBUGCTLMSR:
2049 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2050 VM_EXIT_SAVE_DEBUG_CONTROLS)
2051 get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2053 ret = kvm_set_msr_common(vcpu, msr_info);
2056 case MSR_IA32_BNDCFGS:
2057 if (!kvm_mpx_supported() ||
2058 (!msr_info->host_initiated &&
2059 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2061 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2062 (data & MSR_IA32_BNDCFGS_RSVD))
2064 vmcs_write64(GUEST_BNDCFGS, data);
2066 case MSR_IA32_UMWAIT_CONTROL:
2067 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2070 /* Reserved bit 1 and the upper 32 bits [63:32] must be zero. */
2071 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2074 vmx->msr_ia32_umwait_control = data;
2076 case MSR_IA32_SPEC_CTRL:
2077 if (!msr_info->host_initiated &&
2078 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2081 if (kvm_spec_ctrl_test_value(data))
2084 vmx->spec_ctrl = data;
2090 * When it's written (to non-zero) for the first time, pass
2094 * The handling of the MSR bitmap for L2 guests is done in
2095 * nested_vmx_prepare_msr_bitmap. We should not touch the
2096 * vmcs02.msr_bitmap here since it gets completely overwritten
2097 * in the merging. We update the vmcs01 here for L1 as well
2098 * since it will end up touching the MSR anyway now.
2100 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2104 case MSR_IA32_TSX_CTRL:
2105 if (!msr_info->host_initiated &&
2106 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2108 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2110 goto find_shared_msr;
2111 case MSR_IA32_PRED_CMD:
2112 if (!msr_info->host_initiated &&
2113 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2116 if (data & ~PRED_CMD_IBPB)
2118 if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
2123 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2127 * When it's written (to non-zero) for the first time, pass
2131 * The handling of the MSR bitmap for L2 guests is done in
2132 * nested_vmx_prepare_msr_bitmap. We should not touch the
2133 * vmcs02.msr_bitmap here since it gets completely overwritten
2136 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2139 case MSR_IA32_CR_PAT:
2140 if (!kvm_pat_valid(data))
2143 if (is_guest_mode(vcpu) &&
2144 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2145 get_vmcs12(vcpu)->guest_ia32_pat = data;
2147 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2148 vmcs_write64(GUEST_IA32_PAT, data);
2149 vcpu->arch.pat = data;
2152 ret = kvm_set_msr_common(vcpu, msr_info);
2154 case MSR_IA32_TSC_ADJUST:
2155 ret = kvm_set_msr_common(vcpu, msr_info);
2157 case MSR_IA32_MCG_EXT_CTL:
2158 if ((!msr_info->host_initiated &&
2159 !(to_vmx(vcpu)->msr_ia32_feature_control &
2160 FEAT_CTL_LMCE_ENABLED)) ||
2161 (data & ~MCG_EXT_CTL_LMCE_EN))
2163 vcpu->arch.mcg_ext_ctl = data;
2165 case MSR_IA32_FEAT_CTL:
2166 if (!vmx_feature_control_msr_valid(vcpu, data) ||
2167 (to_vmx(vcpu)->msr_ia32_feature_control &
2168 FEAT_CTL_LOCKED && !msr_info->host_initiated))
2170 vmx->msr_ia32_feature_control = data;
2171 if (msr_info->host_initiated && data == 0)
2172 vmx_leave_nested(vcpu);
2174 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2175 if (!msr_info->host_initiated)
2176 return 1; /* they are read-only */
2177 if (!nested_vmx_allowed(vcpu))
2179 return vmx_set_vmx_msr(vcpu, msr_index, data);
2180 case MSR_IA32_RTIT_CTL:
2181 if (!vmx_pt_mode_is_host_guest() ||
2182 vmx_rtit_ctl_check(vcpu, data) ||
2185 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2186 vmx->pt_desc.guest.ctl = data;
2187 pt_update_intercept_for_msr(vmx);
2189 case MSR_IA32_RTIT_STATUS:
2190 if (!pt_can_write_msr(vmx))
2192 if (data & MSR_IA32_RTIT_STATUS_MASK)
2194 vmx->pt_desc.guest.status = data;
2196 case MSR_IA32_RTIT_CR3_MATCH:
2197 if (!pt_can_write_msr(vmx))
2199 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2200 PT_CAP_cr3_filtering))
2202 vmx->pt_desc.guest.cr3_match = data;
2204 case MSR_IA32_RTIT_OUTPUT_BASE:
2205 if (!pt_can_write_msr(vmx))
2207 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2208 PT_CAP_topa_output) &&
2209 !intel_pt_validate_cap(vmx->pt_desc.caps,
2210 PT_CAP_single_range_output))
2212 if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
2214 vmx->pt_desc.guest.output_base = data;
2216 case MSR_IA32_RTIT_OUTPUT_MASK:
2217 if (!pt_can_write_msr(vmx))
2219 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2220 PT_CAP_topa_output) &&
2221 !intel_pt_validate_cap(vmx->pt_desc.caps,
2222 PT_CAP_single_range_output))
2224 vmx->pt_desc.guest.output_mask = data;
2226 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2227 if (!pt_can_write_msr(vmx))
2229 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2230 if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2231 PT_CAP_num_address_ranges))
2233 if (is_noncanonical_address(data, vcpu))
2236 vmx->pt_desc.guest.addr_b[index / 2] = data;
2238 vmx->pt_desc.guest.addr_a[index / 2] = data;
2241 if (!msr_info->host_initiated &&
2242 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2244 /* Check reserved bits: the upper 32 bits [63:32] must be zero. */
2245 if ((data >> 32) != 0)
2247 goto find_shared_msr;
2251 msr = find_msr_entry(vmx, msr_index);
2253 ret = vmx_set_guest_msr(vmx, msr, data);
2255 ret = kvm_set_msr_common(vcpu, msr_info);
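/*
 * Refresh the cached copy of @reg from the current VMCS and mark it
 * available.  For CR0/CR4 only the guest-owned bits are re-read from
 * hardware; the KVM-owned bits already live in vcpu->arch.
 */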
2261 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2263 unsigned long guest_owned_bits;
2265 kvm_register_mark_available(vcpu, reg);
2269 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2272 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2274 case VCPU_EXREG_PDPTR:
2276 ept_save_pdptrs(vcpu);
2278 case VCPU_EXREG_CR0:
2279 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2281 vcpu->arch.cr0 &= ~guest_owned_bits;
2282 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2284 case VCPU_EXREG_CR3:
2285 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
2286 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2288 case VCPU_EXREG_CR4:
2289 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2291 vcpu->arch.cr4 &= ~guest_owned_bits;
2292 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2300 static __init int cpu_has_kvm_support(void)
2302 return cpu_has_vmx();
2305 static __init int vmx_disabled_by_bios(void)
2307 return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2308 !boot_cpu_has(X86_FEATURE_VMX);
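/*
 * Enable VMX on this CPU: set CR4.VMXE, tell Intel PT that VMX is in use,
 * then execute VMXON on the given VMXON region.  A faulting VMXON (e.g.
 * because the BIOS left VMX disabled in MSR_IA32_FEAT_CTL) is caught via
 * the exception table, and the CR4/PT state is unwound before returning
 * an error.
 */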
2311 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2315 cr4_set_bits(X86_CR4_VMXE);
2316 intel_pt_handle_vmx(1);
2318 asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2319 _ASM_EXTABLE(1b, %l[fault])
2320 : : [vmxon_pointer] "m"(vmxon_pointer)
2325 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2326 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2327 intel_pt_handle_vmx(0);
2328 cr4_clear_bits(X86_CR4_VMXE);
2333 static int hardware_enable(void)
2335 int cpu = raw_smp_processor_id();
2336 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2339 if (cr4_read_shadow() & X86_CR4_VMXE)
2343 * This can happen if we hot-added a CPU but failed to allocate
2344 * VP assist page for it.
2346 if (static_branch_unlikely(&enable_evmcs) &&
2347 !hv_get_vp_assist_page(cpu))
2350 r = kvm_cpu_vmxon(phys_addr);
2360 static void vmclear_local_loaded_vmcss(void)
2362 int cpu = raw_smp_processor_id();
2363 struct loaded_vmcs *v, *n;
2365 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2366 loaded_vmcss_on_cpu_link)
2367 __loaded_vmcs_clear(v);
2371 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() tricks. */
2374 static void kvm_cpu_vmxoff(void)
2376 asm volatile (__ex("vmxoff"));
2378 intel_pt_handle_vmx(0);
2379 cr4_clear_bits(X86_CR4_VMXE);
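/*
 * Disable VMX on this CPU: VMCLEAR every VMCS still loaded here so its
 * data is flushed back to memory, then drop out of VMX operation and
 * clear CR4.VMXE.
 */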
2382 static void hardware_disable(void)
2384 vmclear_local_loaded_vmcss();
2389 * There is no X86_FEATURE for SGX yet, so we need to query CPUID
2390 * directly instead of going through cpu_has(), to ensure KVM is trapping
2391 * ENCLS whenever it's supported in hardware. It does not matter whether
2392 * the host OS supports or has enabled SGX.
2394 static bool cpu_has_sgx(void)
2396 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
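/*
 * Compute a workable control-field value from a required (min) and an
 * optional (opt) bit set: the capability MSR's high word clears optional
 * bits the CPU cannot set to 1, its low word forces bits that must be 1,
 * and the result only fails if a required bit ends up cleared.
 */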
2399 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2400 u32 msr, u32 *result)
2402 u32 vmx_msr_low, vmx_msr_high;
2403 u32 ctl = ctl_min | ctl_opt;
2405 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2407 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2408 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2410 /* Ensure minimum (required) set of control bits are supported. */
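/*
 * Example: ctl_min = CPU_BASED_HLT_EXITING, ctl_opt = CPU_BASED_TPR_SHADOW.
 * On a CPU that allows TPR_SHADOW the result contains both bits; on one that
 * does not, TPR_SHADOW is silently dropped.  Only losing HLT_EXITING (a
 * required bit) would make the adjustment fail.
 */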
2418 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2419 struct vmx_capability *vmx_cap)
2421 u32 vmx_msr_low, vmx_msr_high;
2422 u32 min, opt, min2, opt2;
2423 u32 _pin_based_exec_control = 0;
2424 u32 _cpu_based_exec_control = 0;
2425 u32 _cpu_based_2nd_exec_control = 0;
2426 u32 _vmexit_control = 0;
2427 u32 _vmentry_control = 0;
2429 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2430 min = CPU_BASED_HLT_EXITING |
2431 #ifdef CONFIG_X86_64
2432 CPU_BASED_CR8_LOAD_EXITING |
2433 CPU_BASED_CR8_STORE_EXITING |
2435 CPU_BASED_CR3_LOAD_EXITING |
2436 CPU_BASED_CR3_STORE_EXITING |
2437 CPU_BASED_UNCOND_IO_EXITING |
2438 CPU_BASED_MOV_DR_EXITING |
2439 CPU_BASED_USE_TSC_OFFSETTING |
2440 CPU_BASED_MWAIT_EXITING |
2441 CPU_BASED_MONITOR_EXITING |
2442 CPU_BASED_INVLPG_EXITING |
2443 CPU_BASED_RDPMC_EXITING;
2445 opt = CPU_BASED_TPR_SHADOW |
2446 CPU_BASED_USE_MSR_BITMAPS |
2447 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2448 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2449 &_cpu_based_exec_control) < 0)
2451 #ifdef CONFIG_X86_64
2452 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2453 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2454 ~CPU_BASED_CR8_STORE_EXITING;
2456 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2458 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2459 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2460 SECONDARY_EXEC_WBINVD_EXITING |
2461 SECONDARY_EXEC_ENABLE_VPID |
2462 SECONDARY_EXEC_ENABLE_EPT |
2463 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2464 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2465 SECONDARY_EXEC_DESC |
2466 SECONDARY_EXEC_RDTSCP |
2467 SECONDARY_EXEC_ENABLE_INVPCID |
2468 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2469 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2470 SECONDARY_EXEC_SHADOW_VMCS |
2471 SECONDARY_EXEC_XSAVES |
2472 SECONDARY_EXEC_RDSEED_EXITING |
2473 SECONDARY_EXEC_RDRAND_EXITING |
2474 SECONDARY_EXEC_ENABLE_PML |
2475 SECONDARY_EXEC_TSC_SCALING |
2476 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2477 SECONDARY_EXEC_PT_USE_GPA |
2478 SECONDARY_EXEC_PT_CONCEAL_VMX |
2479 SECONDARY_EXEC_ENABLE_VMFUNC;
2481 opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
2482 if (adjust_vmx_controls(min2, opt2,
2483 MSR_IA32_VMX_PROCBASED_CTLS2,
2484 &_cpu_based_2nd_exec_control) < 0)
2487 #ifndef CONFIG_X86_64
2488 if (!(_cpu_based_2nd_exec_control &
2489 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2490 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2493 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2494 _cpu_based_2nd_exec_control &= ~(
2495 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2496 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2497 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2499 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2500 &vmx_cap->ept, &vmx_cap->vpid);
2502 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2503 /* CR3 accesses and INVLPG don't need to cause VM-exits when EPT is enabled. */
2505 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2506 CPU_BASED_CR3_STORE_EXITING |
2507 CPU_BASED_INVLPG_EXITING);
2508 } else if (vmx_cap->ept) {
2510 pr_warn_once("EPT capabilities advertised even though the "
2511 "enable-EPT VM-execution control cannot be set to 1\n");
2513 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2516 pr_warn_once("VPID capabilities advertised even though the "
2517 "enable-VPID VM-execution control cannot be set to 1\n");
2520 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
2521 #ifdef CONFIG_X86_64
2522 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2524 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
2525 VM_EXIT_LOAD_IA32_PAT |
2526 VM_EXIT_LOAD_IA32_EFER |
2527 VM_EXIT_CLEAR_BNDCFGS |
2528 VM_EXIT_PT_CONCEAL_PIP |
2529 VM_EXIT_CLEAR_IA32_RTIT_CTL;
2530 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2531 &_vmexit_control) < 0)
2534 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2535 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
2536 PIN_BASED_VMX_PREEMPTION_TIMER;
2537 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2538 &_pin_based_exec_control) < 0)
2541 if (cpu_has_broken_vmx_preemption_timer())
2542 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2543 if (!(_cpu_based_2nd_exec_control &
2544 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2545 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2547 min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2548 opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
2549 VM_ENTRY_LOAD_IA32_PAT |
2550 VM_ENTRY_LOAD_IA32_EFER |
2551 VM_ENTRY_LOAD_BNDCFGS |
2552 VM_ENTRY_PT_CONCEAL_PIP |
2553 VM_ENTRY_LOAD_IA32_RTIT_CTL;
2554 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2555 &_vmentry_control) < 0)
2559 * Some CPUs support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2560 * can't be used due to an erratum where VM-exit may incorrectly clear
2561 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the erratum by using the
2562 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2564 if (boot_cpu_data.x86 == 0x6) {
2565 switch (boot_cpu_data.x86_model) {
2566 case 26: /* AAK155 */
2567 case 30: /* AAP115 */
2568 case 37: /* AAT100 */
2569 case 44: /* BC86,AAY89,BD102 */
2571 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2572 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2573 pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2574 "does not work properly. Using workaround\n");
2582 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2584 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2585 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2588 #ifdef CONFIG_X86_64
2589 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2590 if (vmx_msr_high & (1u<<16))
2594 /* Require Write-Back (WB) memory type for VMCS accesses. */
2595 if (((vmx_msr_high >> 18) & 15) != 6)
2598 vmcs_conf->size = vmx_msr_high & 0x1fff;
2599 vmcs_conf->order = get_order(vmcs_conf->size);
2600 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2602 vmcs_conf->revision_id = vmx_msr_low;
2604 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2605 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2606 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2607 vmcs_conf->vmexit_ctrl = _vmexit_control;
2608 vmcs_conf->vmentry_ctrl = _vmentry_control;
2610 if (static_branch_unlikely(&enable_evmcs))
2611 evmcs_sanitize_exec_ctrls(vmcs_conf);
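/*
 * Allocate and zero a VMCS-sized region on @cpu's NUMA node and stamp its
 * header with the revision_id the CPU (or, with eVMCS, KVM_EVMCS_VERSION)
 * expects; shadow VMCSs are additionally flagged in the header.
 */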
2616 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2618 int node = cpu_to_node(cpu);
2622 pages = __alloc_pages_node(node, flags, vmcs_config.order);
2625 vmcs = page_address(pages);
2626 memset(vmcs, 0, vmcs_config.size);
2628 /* KVM supports Enlightened VMCS v1 only */
2629 if (static_branch_unlikely(&enable_evmcs))
2630 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2632 vmcs->hdr.revision_id = vmcs_config.revision_id;
2635 vmcs->hdr.shadow_vmcs = 1;
2639 void free_vmcs(struct vmcs *vmcs)
2641 free_pages((unsigned long)vmcs, vmcs_config.order);
2645 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2647 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2649 if (!loaded_vmcs->vmcs)
2651 loaded_vmcs_clear(loaded_vmcs);
2652 free_vmcs(loaded_vmcs->vmcs);
2653 loaded_vmcs->vmcs = NULL;
2654 if (loaded_vmcs->msr_bitmap)
2655 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2656 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2659 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2661 loaded_vmcs->vmcs = alloc_vmcs(false);
2662 if (!loaded_vmcs->vmcs)
2665 vmcs_clear(loaded_vmcs->vmcs);
2667 loaded_vmcs->shadow_vmcs = NULL;
2668 loaded_vmcs->hv_timer_soft_disabled = false;
2669 loaded_vmcs->cpu = -1;
2670 loaded_vmcs->launched = 0;
2672 if (cpu_has_vmx_msr_bitmap()) {
2673 loaded_vmcs->msr_bitmap = (unsigned long *)
2674 __get_free_page(GFP_KERNEL_ACCOUNT);
2675 if (!loaded_vmcs->msr_bitmap)
2677 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2679 if (IS_ENABLED(CONFIG_HYPERV) &&
2680 static_branch_unlikely(&enable_evmcs) &&
2681 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
2682 struct hv_enlightened_vmcs *evmcs =
2683 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
2685 evmcs->hv_enlightenments_control.msr_bitmap = 1;
2689 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2690 memset(&loaded_vmcs->controls_shadow, 0,
2691 sizeof(struct vmcs_controls_shadow));
2696 free_loaded_vmcs(loaded_vmcs);
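/*
 * vmxarea holds the per-CPU VMXON region (not a guest VMCS); it is set up
 * once in alloc_kvm_area() and handed to VMXON by hardware_enable().
 */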
2700 static void free_kvm_area(void)
2704 for_each_possible_cpu(cpu) {
2705 free_vmcs(per_cpu(vmxarea, cpu));
2706 per_cpu(vmxarea, cpu) = NULL;
2710 static __init int alloc_kvm_area(void)
2714 for_each_possible_cpu(cpu) {
2717 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2724 * When eVMCS is enabled, alloc_vmcs_cpu() sets
2725 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2726 * revision_id reported by MSR_IA32_VMX_BASIC.
2728 * However, even though not explicitly documented by
2729 * TLFS, VMXArea passed as VMXON argument should
2730 * still be marked with the revision_id reported by the physical CPU.
2733 if (static_branch_unlikely(&enable_evmcs))
2734 vmcs->hdr.revision_id = vmcs_config.revision_id;
2736 per_cpu(vmxarea, cpu) = vmcs;
2741 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2742 struct kvm_segment *save)
2744 if (!emulate_invalid_guest_state) {
2746 * CS and SS RPL should be equal during guest entry according
2747 * to VMX spec, but in reality it is not always so. Since vcpu
2748 * is in the middle of the transition from real mode to
2749 * protected mode it is safe to assume that RPL 0 is a good default value.
2752 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2753 save->selector &= ~SEGMENT_RPL_MASK;
2754 save->dpl = save->selector & SEGMENT_RPL_MASK;
2757 vmx_set_segment(vcpu, save, seg);
2760 static void enter_pmode(struct kvm_vcpu *vcpu)
2762 unsigned long flags;
2763 struct vcpu_vmx *vmx = to_vmx(vcpu);
2766 * Update the real mode segment cache. It may not be up-to-date if a segment
2767 * register was written while the vcpu was in guest mode.
2769 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2770 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2771 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2772 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2773 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2774 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2776 vmx->rmode.vm86_active = 0;
2778 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2780 flags = vmcs_readl(GUEST_RFLAGS);
2781 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2782 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2783 vmcs_writel(GUEST_RFLAGS, flags);
2785 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2786 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2788 update_exception_bitmap(vcpu);
2790 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2791 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2792 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2793 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2794 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2795 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
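/*
 * Real-mode segments must look like vm86 segments to the CPU: base equal to
 * selector << 4, a 64KiB limit and vm86-compatible access rights.
 * fix_rmode_seg() rewrites the cached protected-mode segment to that shape
 * before loading it into the VMCS.
 */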
2798 static void fix_rmode_seg(int seg, struct kvm_segment *save)
2800 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2801 struct kvm_segment var = *save;
2804 if (seg == VCPU_SREG_CS)
2807 if (!emulate_invalid_guest_state) {
2808 var.selector = var.base >> 4;
2809 var.base = var.base & 0xffff0;
2819 if (save->base & 0xf)
2820 printk_once(KERN_WARNING "kvm: segment base is not "
2821 "paragraph aligned when entering "
2822 "protected mode (seg=%d)", seg);
2825 vmcs_write16(sf->selector, var.selector);
2826 vmcs_writel(sf->base, var.base);
2827 vmcs_write32(sf->limit, var.limit);
2828 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2831 static void enter_rmode(struct kvm_vcpu *vcpu)
2833 unsigned long flags;
2834 struct vcpu_vmx *vmx = to_vmx(vcpu);
2835 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
2837 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2838 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2839 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2840 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2841 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2842 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2843 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2845 vmx->rmode.vm86_active = 1;
2848 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2849 * vcpu. Warn the user that an update is overdue.
2851 if (!kvm_vmx->tss_addr)
2852 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
2853 "called before entering vcpu\n");
2855 vmx_segment_cache_clear(vmx);
2857 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
2858 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2859 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2861 flags = vmcs_readl(GUEST_RFLAGS);
2862 vmx->rmode.save_rflags = flags;
2864 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2866 vmcs_writel(GUEST_RFLAGS, flags);
2867 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2868 update_exception_bitmap(vcpu);
2870 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2871 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2872 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2873 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2874 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2875 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2877 kvm_mmu_reset_context(vcpu);
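/*
 * Propagate a new guest EFER: EFER.LMA drives the VM-entry "IA-32e mode
 * guest" control, and when LMA is clear EFER.LME is masked out of the
 * value kept in the shared MSR slot.
 */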
2880 void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2882 struct vcpu_vmx *vmx = to_vmx(vcpu);
2883 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
2888 vcpu->arch.efer = efer;
2889 if (efer & EFER_LMA) {
2890 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2893 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2895 msr->data = efer & ~EFER_LME;
2900 #ifdef CONFIG_X86_64
2902 static void enter_lmode(struct kvm_vcpu *vcpu)
2906 vmx_segment_cache_clear(to_vmx(vcpu));
2908 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2909 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
2910 pr_debug_ratelimited("%s: tss fixup for long mode.\n",
2912 vmcs_write32(GUEST_TR_AR_BYTES,
2913 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
2914 | VMX_AR_TYPE_BUSY_64_TSS);
2916 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2919 static void exit_lmode(struct kvm_vcpu *vcpu)
2921 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2922 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2927 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
2929 struct vcpu_vmx *vmx = to_vmx(vcpu);
2932 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
2933 * the CPU is not required to invalidate guest-physical mappings on
2934 * VM-Entry, even if VPID is disabled. Guest-physical mappings are
2935 * associated with the root EPT structure and not any particular VPID
2936 * (INVVPID also isn't required to invalidate guest-physical mappings).
2940 } else if (enable_vpid) {
2941 if (cpu_has_vmx_invvpid_global()) {
2942 vpid_sync_vcpu_global();
2944 vpid_sync_vcpu_single(vmx->vpid);
2945 vpid_sync_vcpu_single(vmx->nested.vpid02);
2950 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
2952 struct kvm_mmu *mmu = vcpu->arch.mmu;
2953 u64 root_hpa = mmu->root_hpa;
2955 /* No flush required if the current context is invalid. */
2956 if (!VALID_PAGE(root_hpa))
2960 ept_sync_context(construct_eptp(vcpu, root_hpa,
2961 mmu->shadow_root_level));
2962 else if (!is_guest_mode(vcpu))
2963 vpid_sync_context(to_vmx(vcpu)->vpid);
2965 vpid_sync_context(nested_get_vpid02(vcpu));
2968 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
2971 * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
2972 * vmx_flush_tlb_guest() for an explanation of why this is ok.
2974 vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
2977 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
2980 * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
2981 * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
2982 * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
2983 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
2984 * i.e. no explicit INVVPID is necessary.
2986 vpid_sync_context(to_vmx(vcpu)->vpid);
2989 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
2991 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2993 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
2996 if (is_pae_paging(vcpu)) {
2997 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
2998 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
2999 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3000 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3004 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3006 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3008 if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3011 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3012 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3013 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3014 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3016 kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
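/*
 * With EPT but without unrestricted guest, CR3 accesses must be intercepted
 * while guest paging is off so KVM can keep hardware pointed at the identity
 * map; the intercepts are removed again once the guest turns paging on.
 */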
3019 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
3021 struct kvm_vcpu *vcpu)
3023 struct vcpu_vmx *vmx = to_vmx(vcpu);
3025 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3026 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3027 if (!(cr0 & X86_CR0_PG)) {
3028 /* From paging/starting to nonpaging */
3029 exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
3030 CPU_BASED_CR3_STORE_EXITING);
3031 vcpu->arch.cr0 = cr0;
3032 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3033 } else if (!is_paging(vcpu)) {
3034 /* From nonpaging to paging */
3035 exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
3036 CPU_BASED_CR3_STORE_EXITING);
3037 vcpu->arch.cr0 = cr0;
3038 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3041 if (!(cr0 & X86_CR0_WP))
3042 *hw_cr0 &= ~X86_CR0_WP;
3045 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3047 struct vcpu_vmx *vmx = to_vmx(vcpu);
3048 unsigned long hw_cr0;
3050 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3051 if (enable_unrestricted_guest)
3052 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3054 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3056 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3059 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3063 #ifdef CONFIG_X86_64
3064 if (vcpu->arch.efer & EFER_LME) {
3065 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3067 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3072 if (enable_ept && !enable_unrestricted_guest)
3073 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
3075 vmcs_writel(CR0_READ_SHADOW, cr0);
3076 vmcs_writel(GUEST_CR0, hw_cr0);
3077 vcpu->arch.cr0 = cr0;
3078 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3080 /* depends on vcpu->arch.cr0 being set to the new value */
3081 vmx->emulation_required = emulation_required(vcpu);
3084 static int vmx_get_max_tdp_level(void)
3086 if (cpu_has_vmx_ept_5levels())
3091 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
3094 u64 eptp = VMX_EPTP_MT_WB;
3096 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3098 if (enable_ept_ad_bits &&
3099 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3100 eptp |= VMX_EPTP_AD_ENABLE_BIT;
3101 eptp |= (root_hpa & PAGE_MASK);
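/*
 * Example: a 4-level EPT root at 0x12345000 with A/D bits enabled encodes as
 * 0x12345000 | VMX_EPTP_PWL_4 | VMX_EPTP_AD_ENABLE_BIT | VMX_EPTP_MT_WB,
 * i.e. memory type WB in bits 2:0, page-walk length 4 in bits 5:3, A/D
 * enable in bit 6 and the root PFN in the upper bits.
 */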
3106 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
3109 struct kvm *kvm = vcpu->kvm;
3110 bool update_guest_cr3 = true;
3111 unsigned long guest_cr3;
3115 eptp = construct_eptp(vcpu, pgd, pgd_level);
3116 vmcs_write64(EPT_POINTER, eptp);
3118 if (kvm_x86_ops.tlb_remote_flush) {
3119 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3120 to_vmx(vcpu)->ept_pointer = eptp;
3121 to_kvm_vmx(kvm)->ept_pointers_match
3122 = EPT_POINTERS_CHECK;
3123 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3126 if (!enable_unrestricted_guest && !is_paging(vcpu))
3127 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3128 else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3129 guest_cr3 = vcpu->arch.cr3;
3130 else /* vmcs01.GUEST_CR3 is already up-to-date. */
3131 update_guest_cr3 = false;
3132 vmx_ept_load_pdptrs(vcpu);
3137 if (update_guest_cr3)
3138 vmcs_writel(GUEST_CR3, guest_cr3);
3141 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3143 struct vcpu_vmx *vmx = to_vmx(vcpu);
3145 * Pass through host's Machine Check Enable value to hw_cr4, which
3146 * is in force while we are in guest mode. Do not let guests control
3147 * this bit, even if host CR4.MCE == 0.
3149 unsigned long hw_cr4;
3151 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3152 if (enable_unrestricted_guest)
3153 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3154 else if (vmx->rmode.vm86_active)
3155 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3157 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3159 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3160 if (cr4 & X86_CR4_UMIP) {
3161 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3162 hw_cr4 &= ~X86_CR4_UMIP;
3163 } else if (!is_guest_mode(vcpu) ||
3164 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3165 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3169 if (cr4 & X86_CR4_VMXE) {
3171 * To use VMXON (and later other VMX instructions), a guest
3172 * must first be able to turn on cr4.VMXE (see handle_vmon()).
3173 * So basically the check on whether to allow nested VMX
3174 * is here. We operate under the default treatment of SMM,
3175 * so VMX cannot be enabled under SMM.
3177 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
3181 if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3184 vcpu->arch.cr4 = cr4;
3185 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3187 if (!enable_unrestricted_guest) {
3189 if (!is_paging(vcpu)) {
3190 hw_cr4 &= ~X86_CR4_PAE;
3191 hw_cr4 |= X86_CR4_PSE;
3192 } else if (!(cr4 & X86_CR4_PAE)) {
3193 hw_cr4 &= ~X86_CR4_PAE;
3198 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3199 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3200 * to be manually disabled when the guest switches to non-paging mode.
3203 * If !enable_unrestricted_guest, the CPU is always running
3204 * with CR0.PG=1 and CR4 needs to be modified.
3205 * If enable_unrestricted_guest, the CPU automatically
3206 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3208 if (!is_paging(vcpu))
3209 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3212 vmcs_writel(CR4_READ_SHADOW, cr4);
3213 vmcs_writel(GUEST_CR4, hw_cr4);
3217 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3219 struct vcpu_vmx *vmx = to_vmx(vcpu);
3222 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3223 *var = vmx->rmode.segs[seg];
3224 if (seg == VCPU_SREG_TR
3225 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3227 var->base = vmx_read_guest_seg_base(vmx, seg);
3228 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3231 var->base = vmx_read_guest_seg_base(vmx, seg);
3232 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3233 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3234 ar = vmx_read_guest_seg_ar(vmx, seg);
3235 var->unusable = (ar >> 16) & 1;
3236 var->type = ar & 15;
3237 var->s = (ar >> 4) & 1;
3238 var->dpl = (ar >> 5) & 3;
3240 * Some userspaces do not preserve the unusable property. Since a usable
3241 * segment has to be present according to the VMX spec, we can paper over
3242 * the userspace bug by making an unusable segment always nonpresent.
3243 * vmx_segment_access_rights() already marks a nonpresent segment as
3244 * unusable.
3246 var->present = !var->unusable;
3247 var->avl = (ar >> 12) & 1;
3248 var->l = (ar >> 13) & 1;
3249 var->db = (ar >> 14) & 1;
3250 var->g = (ar >> 15) & 1;
3253 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3255 struct kvm_segment s;
3257 if (to_vmx(vcpu)->rmode.vm86_active) {
3258 vmx_get_segment(vcpu, &s, seg);
3261 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3264 int vmx_get_cpl(struct kvm_vcpu *vcpu)
3266 struct vcpu_vmx *vmx = to_vmx(vcpu);
3268 if (unlikely(vmx->rmode.vm86_active))
3271 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3272 return VMX_AR_DPL(ar);
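/*
 * Pack a struct kvm_segment into the VMX access-rights format: type[3:0],
 * S(4), DPL[6:5], P(7), AVL(12), L(13), D/B(14), G(15); an unusable or
 * nonpresent segment is reported as the "unusable" encoding (bit 16).
 */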
3276 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3280 if (var->unusable || !var->present)
3283 ar = var->type & 15;
3284 ar |= (var->s & 1) << 4;
3285 ar |= (var->dpl & 3) << 5;
3286 ar |= (var->present & 1) << 7;
3287 ar |= (var->avl & 1) << 12;
3288 ar |= (var->l & 1) << 13;
3289 ar |= (var->db & 1) << 14;
3290 ar |= (var->g & 1) << 15;
3296 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3298 struct vcpu_vmx *vmx = to_vmx(vcpu);
3299 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3301 vmx_segment_cache_clear(vmx);
3303 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3304 vmx->rmode.segs[seg] = *var;
3305 if (seg == VCPU_SREG_TR)
3306 vmcs_write16(sf->selector, var->selector);
3308 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3312 vmcs_writel(sf->base, var->base);
3313 vmcs_write32(sf->limit, var->limit);
3314 vmcs_write16(sf->selector, var->selector);
3317 * Fix the "Accessed" bit in AR field of segment registers for older qemu binaries.
3319 * IA32 arch specifies that at the time of processor reset the
3320 * "Accessed" bit in the AR field of segment registers is 1. And qemu
3321 * is setting it to 0 in the userland code. This causes invalid guest
3322 * state vmexit when "unrestricted guest" mode is turned on.
3323 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3324 * tree. Newer qemu binaries with that qemu fix would not need this kvm hack.
3327 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3328 var->type |= 0x1; /* Accessed */
3330 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3333 vmx->emulation_required = emulation_required(vcpu);
3336 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3338 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3340 *db = (ar >> 14) & 1;
3341 *l = (ar >> 13) & 1;
3344 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3346 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3347 dt->address = vmcs_readl(GUEST_IDTR_BASE);
3350 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3352 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3353 vmcs_writel(GUEST_IDTR_BASE, dt->address);
3356 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3358 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3359 dt->address = vmcs_readl(GUEST_GDTR_BASE);
3362 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3364 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3365 vmcs_writel(GUEST_GDTR_BASE, dt->address);
3368 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3370 struct kvm_segment var;
3373 vmx_get_segment(vcpu, &var, seg);
3375 if (seg == VCPU_SREG_CS)
3377 ar = vmx_segment_access_rights(&var);
3379 if (var.base != (var.selector << 4))
3381 if (var.limit != 0xffff)
3389 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3391 struct kvm_segment cs;
3392 unsigned int cs_rpl;
3394 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3395 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3399 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3403 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3404 if (cs.dpl > cs_rpl)
3407 if (cs.dpl != cs_rpl)
3413 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3417 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3419 struct kvm_segment ss;
3420 unsigned int ss_rpl;
3422 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3423 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3427 if (ss.type != 3 && ss.type != 7)