kvm: x86: Set highest physical address bits in non-present/reserved SPTEs
[muen/linux.git] arch/x86/kvm/vmx.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include "irq.h"
20 #include "mmu.h"
21 #include "cpuid.h"
22 #include "lapic.h"
23
24 #include <linux/kvm_host.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
27 #include <linux/mm.h>
28 #include <linux/highmem.h>
29 #include <linux/sched.h>
30 #include <linux/moduleparam.h>
31 #include <linux/mod_devicetable.h>
32 #include <linux/trace_events.h>
33 #include <linux/slab.h>
34 #include <linux/tboot.h>
35 #include <linux/hrtimer.h>
36 #include <linux/frame.h>
37 #include <linux/nospec.h>
38 #include "kvm_cache_regs.h"
39 #include "x86.h"
40
41 #include <asm/asm.h>
42 #include <asm/cpu.h>
43 #include <asm/io.h>
44 #include <asm/desc.h>
45 #include <asm/vmx.h>
46 #include <asm/virtext.h>
47 #include <asm/mce.h>
48 #include <asm/fpu/internal.h>
49 #include <asm/perf_event.h>
50 #include <asm/debugreg.h>
51 #include <asm/kexec.h>
52 #include <asm/apic.h>
53 #include <asm/irq_remapping.h>
54 #include <asm/mmu_context.h>
55 #include <asm/spec-ctrl.h>
56 #include <asm/mshyperv.h>
57
58 #include "trace.h"
59 #include "pmu.h"
60 #include "vmx_evmcs.h"
61
62 #define __ex(x) __kvm_handle_fault_on_reboot(x)
63 #define __ex_clear(x, reg) \
64         ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
65
66 MODULE_AUTHOR("Qumranet");
67 MODULE_LICENSE("GPL");
68
69 static const struct x86_cpu_id vmx_cpu_id[] = {
70         X86_FEATURE_MATCH(X86_FEATURE_VMX),
71         {}
72 };
73 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
74
75 static bool __read_mostly enable_vpid = 1;
76 module_param_named(vpid, enable_vpid, bool, 0444);
77
78 static bool __read_mostly enable_vnmi = 1;
79 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
80
81 static bool __read_mostly flexpriority_enabled = 1;
82 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
83
84 static bool __read_mostly enable_ept = 1;
85 module_param_named(ept, enable_ept, bool, S_IRUGO);
86
87 static bool __read_mostly enable_unrestricted_guest = 1;
88 module_param_named(unrestricted_guest,
89                         enable_unrestricted_guest, bool, S_IRUGO);
90
91 static bool __read_mostly enable_ept_ad_bits = 1;
92 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
93
94 static bool __read_mostly emulate_invalid_guest_state = true;
95 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
96
97 static bool __read_mostly fasteoi = 1;
98 module_param(fasteoi, bool, S_IRUGO);
99
100 static bool __read_mostly enable_apicv = 1;
101 module_param(enable_apicv, bool, S_IRUGO);
102
103 static bool __read_mostly enable_shadow_vmcs = 1;
104 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
105 /*
106  * If nested=1, nested virtualization is supported, i.e., guests may use
107  * VMX and be a hypervisor for their own guests. If nested=0, guests may not
108  * use VMX instructions.
109  */
110 static bool __read_mostly nested = 0;
111 module_param(nested, bool, S_IRUGO);
112
113 static u64 __read_mostly host_xss;
114
115 static bool __read_mostly enable_pml = 1;
116 module_param_named(pml, enable_pml, bool, S_IRUGO);
117
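/*
 * MSR intercept types used when programming the MSR bitmaps, and the
 * flag bits tracked in vcpu_vmx.msr_bitmap_mode.
 */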
118 #define MSR_TYPE_R      1
119 #define MSR_TYPE_W      2
120 #define MSR_TYPE_RW     3
121
122 #define MSR_BITMAP_MODE_X2APIC          1
123 #define MSR_BITMAP_MODE_X2APIC_APICV    2
124 #define MSR_BITMAP_MODE_LM              4
125
126 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
127
128 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
129 static int __read_mostly cpu_preemption_timer_multi;
130 static bool __read_mostly enable_preemption_timer = 1;
131 #ifdef CONFIG_X86_64
132 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
133 #endif
134
135 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
136 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
137 #define KVM_VM_CR0_ALWAYS_ON                            \
138         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |      \
139          X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
140 #define KVM_CR4_GUEST_OWNED_BITS                                      \
141         (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
142          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
143
144 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
145 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
146 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
147
148 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
149
150 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
151
152 /*
153  * Hyper-V requires all of these, so mark them as supported even though
154  * they are just treated the same as all-context.
155  */
156 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
157         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
158         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
159         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
160         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
161
162 /*
163  * These two parameters are used to configure the controls for Pause-Loop Exiting:
164  * ple_gap:    upper bound on the amount of time between two successive
165  *             executions of PAUSE in a loop. Also indicates whether PLE is enabled.
166  *             According to tests, this time is usually smaller than 128 cycles.
167  * ple_window: upper bound on the amount of time a guest is allowed to execute
168  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
169  *             less than 2^12 cycles.
170  * Time is measured based on a counter that runs at the same rate as the TSC,
171  * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
172  */
173 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
174
175 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
176 module_param(ple_window, uint, 0444);
177
178 /* Default doubles per-vcpu window every exit. */
179 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
180 module_param(ple_window_grow, uint, 0444);
181
182 /* Default resets per-vcpu window every exit to ple_window. */
183 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
184 module_param(ple_window_shrink, uint, 0444);
185
186 /* Default is to compute the maximum so we can never overflow. */
187 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
188 module_param(ple_window_max, uint, 0444);
189
190 extern const ulong vmx_return;
191
192 enum ept_pointers_status {
193         EPT_POINTERS_CHECK = 0,
194         EPT_POINTERS_MATCH = 1,
195         EPT_POINTERS_MISMATCH = 2
196 };
197
198 struct kvm_vmx {
199         struct kvm kvm;
200
201         unsigned int tss_addr;
202         bool ept_identity_pagetable_done;
203         gpa_t ept_identity_map_addr;
204
205         enum ept_pointers_status ept_pointers_match;
206         spinlock_t ept_pointer_lock;
207 };
208
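/* Maximum number of MSRs switched atomically via the VM-entry/VM-exit autoload lists (struct msr_autoload). */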
209 #define NR_AUTOLOAD_MSRS 8
210
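/*
 * Layout of a hardware VMCS region: revision header, abort indicator,
 * then implementation-specific data.
 */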
211 struct vmcs_hdr {
212         u32 revision_id:31;
213         u32 shadow_vmcs:1;
214 };
215
216 struct vmcs {
217         struct vmcs_hdr hdr;
218         u32 abort;
219         char data[0];
220 };
221
222 /*
223  * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
224  * and whose values change infrequently, but are not constant.  I.e. this is
225  * used as a write-through cache of the corresponding VMCS fields.
226  */
227 struct vmcs_host_state {
228         unsigned long cr3;      /* May not match real cr3 */
229         unsigned long cr4;      /* May not match real cr4 */
230         unsigned long gs_base;
231         unsigned long fs_base;
232
233         u16           fs_sel, gs_sel, ldt_sel;
234 #ifdef CONFIG_X86_64
235         u16           ds_sel, es_sel;
236 #endif
237 };
238
239 /*
240  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
241  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
242  * loaded on this CPU (so we can clear them if the CPU goes down).
243  */
244 struct loaded_vmcs {
245         struct vmcs *vmcs;
246         struct vmcs *shadow_vmcs;
247         int cpu;
248         bool launched;
249         bool nmi_known_unmasked;
250         /* Support for vnmi-less CPUs */
251         int soft_vnmi_blocked;
252         ktime_t entry_time;
253         s64 vnmi_blocked_time;
254         unsigned long *msr_bitmap;
255         struct list_head loaded_vmcss_on_cpu_link;
256         struct vmcs_host_state host_state;
257 };
258
259 struct shared_msr_entry {
260         unsigned index;
261         u64 data;
262         u64 mask;
263 };
264
265 /*
266  * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
267  * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
268  * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
269  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
270  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
271  * More than one of these structures may exist, if L1 runs multiple L2 guests.
272  * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
273  * underlying hardware which will be used to run L2.
274  * This structure is packed to ensure that its layout is identical across
275  * machines (necessary for live migration).
276  *
277  * IMPORTANT: Changing the layout of existing fields in this structure
278  * will break save/restore compatibility with older kvm releases. When
279  * adding new fields, either use space in the reserved padding* arrays
280  * or add the new fields to the end of the structure.
281  */
282 typedef u64 natural_width;
283 struct __packed vmcs12 {
284         /* According to the Intel spec, a VMCS region must start with the
285          * following two fields. Then follow implementation-specific data.
286          */
287         struct vmcs_hdr hdr;
288         u32 abort;
289
290         u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
291         u32 padding[7]; /* room for future expansion */
292
293         u64 io_bitmap_a;
294         u64 io_bitmap_b;
295         u64 msr_bitmap;
296         u64 vm_exit_msr_store_addr;
297         u64 vm_exit_msr_load_addr;
298         u64 vm_entry_msr_load_addr;
299         u64 tsc_offset;
300         u64 virtual_apic_page_addr;
301         u64 apic_access_addr;
302         u64 posted_intr_desc_addr;
303         u64 ept_pointer;
304         u64 eoi_exit_bitmap0;
305         u64 eoi_exit_bitmap1;
306         u64 eoi_exit_bitmap2;
307         u64 eoi_exit_bitmap3;
308         u64 xss_exit_bitmap;
309         u64 guest_physical_address;
310         u64 vmcs_link_pointer;
311         u64 guest_ia32_debugctl;
312         u64 guest_ia32_pat;
313         u64 guest_ia32_efer;
314         u64 guest_ia32_perf_global_ctrl;
315         u64 guest_pdptr0;
316         u64 guest_pdptr1;
317         u64 guest_pdptr2;
318         u64 guest_pdptr3;
319         u64 guest_bndcfgs;
320         u64 host_ia32_pat;
321         u64 host_ia32_efer;
322         u64 host_ia32_perf_global_ctrl;
323         u64 vmread_bitmap;
324         u64 vmwrite_bitmap;
325         u64 vm_function_control;
326         u64 eptp_list_address;
327         u64 pml_address;
328         u64 padding64[3]; /* room for future expansion */
329         /*
330          * To allow migration of L1 (complete with its L2 guests) between
331          * machines of different natural widths (32 or 64 bit), we cannot have
332          * unsigned long fields with no explicit size. We use u64 (aliased
333          * natural_width) instead. Luckily, x86 is little-endian.
334          */
335         natural_width cr0_guest_host_mask;
336         natural_width cr4_guest_host_mask;
337         natural_width cr0_read_shadow;
338         natural_width cr4_read_shadow;
339         natural_width cr3_target_value0;
340         natural_width cr3_target_value1;
341         natural_width cr3_target_value2;
342         natural_width cr3_target_value3;
343         natural_width exit_qualification;
344         natural_width guest_linear_address;
345         natural_width guest_cr0;
346         natural_width guest_cr3;
347         natural_width guest_cr4;
348         natural_width guest_es_base;
349         natural_width guest_cs_base;
350         natural_width guest_ss_base;
351         natural_width guest_ds_base;
352         natural_width guest_fs_base;
353         natural_width guest_gs_base;
354         natural_width guest_ldtr_base;
355         natural_width guest_tr_base;
356         natural_width guest_gdtr_base;
357         natural_width guest_idtr_base;
358         natural_width guest_dr7;
359         natural_width guest_rsp;
360         natural_width guest_rip;
361         natural_width guest_rflags;
362         natural_width guest_pending_dbg_exceptions;
363         natural_width guest_sysenter_esp;
364         natural_width guest_sysenter_eip;
365         natural_width host_cr0;
366         natural_width host_cr3;
367         natural_width host_cr4;
368         natural_width host_fs_base;
369         natural_width host_gs_base;
370         natural_width host_tr_base;
371         natural_width host_gdtr_base;
372         natural_width host_idtr_base;
373         natural_width host_ia32_sysenter_esp;
374         natural_width host_ia32_sysenter_eip;
375         natural_width host_rsp;
376         natural_width host_rip;
377         natural_width paddingl[8]; /* room for future expansion */
378         u32 pin_based_vm_exec_control;
379         u32 cpu_based_vm_exec_control;
380         u32 exception_bitmap;
381         u32 page_fault_error_code_mask;
382         u32 page_fault_error_code_match;
383         u32 cr3_target_count;
384         u32 vm_exit_controls;
385         u32 vm_exit_msr_store_count;
386         u32 vm_exit_msr_load_count;
387         u32 vm_entry_controls;
388         u32 vm_entry_msr_load_count;
389         u32 vm_entry_intr_info_field;
390         u32 vm_entry_exception_error_code;
391         u32 vm_entry_instruction_len;
392         u32 tpr_threshold;
393         u32 secondary_vm_exec_control;
394         u32 vm_instruction_error;
395         u32 vm_exit_reason;
396         u32 vm_exit_intr_info;
397         u32 vm_exit_intr_error_code;
398         u32 idt_vectoring_info_field;
399         u32 idt_vectoring_error_code;
400         u32 vm_exit_instruction_len;
401         u32 vmx_instruction_info;
402         u32 guest_es_limit;
403         u32 guest_cs_limit;
404         u32 guest_ss_limit;
405         u32 guest_ds_limit;
406         u32 guest_fs_limit;
407         u32 guest_gs_limit;
408         u32 guest_ldtr_limit;
409         u32 guest_tr_limit;
410         u32 guest_gdtr_limit;
411         u32 guest_idtr_limit;
412         u32 guest_es_ar_bytes;
413         u32 guest_cs_ar_bytes;
414         u32 guest_ss_ar_bytes;
415         u32 guest_ds_ar_bytes;
416         u32 guest_fs_ar_bytes;
417         u32 guest_gs_ar_bytes;
418         u32 guest_ldtr_ar_bytes;
419         u32 guest_tr_ar_bytes;
420         u32 guest_interruptibility_info;
421         u32 guest_activity_state;
422         u32 guest_sysenter_cs;
423         u32 host_ia32_sysenter_cs;
424         u32 vmx_preemption_timer_value;
425         u32 padding32[7]; /* room for future expansion */
426         u16 virtual_processor_id;
427         u16 posted_intr_nv;
428         u16 guest_es_selector;
429         u16 guest_cs_selector;
430         u16 guest_ss_selector;
431         u16 guest_ds_selector;
432         u16 guest_fs_selector;
433         u16 guest_gs_selector;
434         u16 guest_ldtr_selector;
435         u16 guest_tr_selector;
436         u16 guest_intr_status;
437         u16 host_es_selector;
438         u16 host_cs_selector;
439         u16 host_ss_selector;
440         u16 host_ds_selector;
441         u16 host_fs_selector;
442         u16 host_gs_selector;
443         u16 host_tr_selector;
444         u16 guest_pml_index;
445 };
446
447 /*
448  * For save/restore compatibility, the vmcs12 field offsets must not change.
449  */
450 #define CHECK_OFFSET(field, loc)                                \
451         BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc),       \
452                 "Offset of " #field " in struct vmcs12 has changed.")
453
454 static inline void vmx_check_vmcs12_offsets(void) {
455         CHECK_OFFSET(hdr, 0);
456         CHECK_OFFSET(abort, 4);
457         CHECK_OFFSET(launch_state, 8);
458         CHECK_OFFSET(io_bitmap_a, 40);
459         CHECK_OFFSET(io_bitmap_b, 48);
460         CHECK_OFFSET(msr_bitmap, 56);
461         CHECK_OFFSET(vm_exit_msr_store_addr, 64);
462         CHECK_OFFSET(vm_exit_msr_load_addr, 72);
463         CHECK_OFFSET(vm_entry_msr_load_addr, 80);
464         CHECK_OFFSET(tsc_offset, 88);
465         CHECK_OFFSET(virtual_apic_page_addr, 96);
466         CHECK_OFFSET(apic_access_addr, 104);
467         CHECK_OFFSET(posted_intr_desc_addr, 112);
468         CHECK_OFFSET(ept_pointer, 120);
469         CHECK_OFFSET(eoi_exit_bitmap0, 128);
470         CHECK_OFFSET(eoi_exit_bitmap1, 136);
471         CHECK_OFFSET(eoi_exit_bitmap2, 144);
472         CHECK_OFFSET(eoi_exit_bitmap3, 152);
473         CHECK_OFFSET(xss_exit_bitmap, 160);
474         CHECK_OFFSET(guest_physical_address, 168);
475         CHECK_OFFSET(vmcs_link_pointer, 176);
476         CHECK_OFFSET(guest_ia32_debugctl, 184);
477         CHECK_OFFSET(guest_ia32_pat, 192);
478         CHECK_OFFSET(guest_ia32_efer, 200);
479         CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
480         CHECK_OFFSET(guest_pdptr0, 216);
481         CHECK_OFFSET(guest_pdptr1, 224);
482         CHECK_OFFSET(guest_pdptr2, 232);
483         CHECK_OFFSET(guest_pdptr3, 240);
484         CHECK_OFFSET(guest_bndcfgs, 248);
485         CHECK_OFFSET(host_ia32_pat, 256);
486         CHECK_OFFSET(host_ia32_efer, 264);
487         CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
488         CHECK_OFFSET(vmread_bitmap, 280);
489         CHECK_OFFSET(vmwrite_bitmap, 288);
490         CHECK_OFFSET(vm_function_control, 296);
491         CHECK_OFFSET(eptp_list_address, 304);
492         CHECK_OFFSET(pml_address, 312);
493         CHECK_OFFSET(cr0_guest_host_mask, 344);
494         CHECK_OFFSET(cr4_guest_host_mask, 352);
495         CHECK_OFFSET(cr0_read_shadow, 360);
496         CHECK_OFFSET(cr4_read_shadow, 368);
497         CHECK_OFFSET(cr3_target_value0, 376);
498         CHECK_OFFSET(cr3_target_value1, 384);
499         CHECK_OFFSET(cr3_target_value2, 392);
500         CHECK_OFFSET(cr3_target_value3, 400);
501         CHECK_OFFSET(exit_qualification, 408);
502         CHECK_OFFSET(guest_linear_address, 416);
503         CHECK_OFFSET(guest_cr0, 424);
504         CHECK_OFFSET(guest_cr3, 432);
505         CHECK_OFFSET(guest_cr4, 440);
506         CHECK_OFFSET(guest_es_base, 448);
507         CHECK_OFFSET(guest_cs_base, 456);
508         CHECK_OFFSET(guest_ss_base, 464);
509         CHECK_OFFSET(guest_ds_base, 472);
510         CHECK_OFFSET(guest_fs_base, 480);
511         CHECK_OFFSET(guest_gs_base, 488);
512         CHECK_OFFSET(guest_ldtr_base, 496);
513         CHECK_OFFSET(guest_tr_base, 504);
514         CHECK_OFFSET(guest_gdtr_base, 512);
515         CHECK_OFFSET(guest_idtr_base, 520);
516         CHECK_OFFSET(guest_dr7, 528);
517         CHECK_OFFSET(guest_rsp, 536);
518         CHECK_OFFSET(guest_rip, 544);
519         CHECK_OFFSET(guest_rflags, 552);
520         CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
521         CHECK_OFFSET(guest_sysenter_esp, 568);
522         CHECK_OFFSET(guest_sysenter_eip, 576);
523         CHECK_OFFSET(host_cr0, 584);
524         CHECK_OFFSET(host_cr3, 592);
525         CHECK_OFFSET(host_cr4, 600);
526         CHECK_OFFSET(host_fs_base, 608);
527         CHECK_OFFSET(host_gs_base, 616);
528         CHECK_OFFSET(host_tr_base, 624);
529         CHECK_OFFSET(host_gdtr_base, 632);
530         CHECK_OFFSET(host_idtr_base, 640);
531         CHECK_OFFSET(host_ia32_sysenter_esp, 648);
532         CHECK_OFFSET(host_ia32_sysenter_eip, 656);
533         CHECK_OFFSET(host_rsp, 664);
534         CHECK_OFFSET(host_rip, 672);
535         CHECK_OFFSET(pin_based_vm_exec_control, 744);
536         CHECK_OFFSET(cpu_based_vm_exec_control, 748);
537         CHECK_OFFSET(exception_bitmap, 752);
538         CHECK_OFFSET(page_fault_error_code_mask, 756);
539         CHECK_OFFSET(page_fault_error_code_match, 760);
540         CHECK_OFFSET(cr3_target_count, 764);
541         CHECK_OFFSET(vm_exit_controls, 768);
542         CHECK_OFFSET(vm_exit_msr_store_count, 772);
543         CHECK_OFFSET(vm_exit_msr_load_count, 776);
544         CHECK_OFFSET(vm_entry_controls, 780);
545         CHECK_OFFSET(vm_entry_msr_load_count, 784);
546         CHECK_OFFSET(vm_entry_intr_info_field, 788);
547         CHECK_OFFSET(vm_entry_exception_error_code, 792);
548         CHECK_OFFSET(vm_entry_instruction_len, 796);
549         CHECK_OFFSET(tpr_threshold, 800);
550         CHECK_OFFSET(secondary_vm_exec_control, 804);
551         CHECK_OFFSET(vm_instruction_error, 808);
552         CHECK_OFFSET(vm_exit_reason, 812);
553         CHECK_OFFSET(vm_exit_intr_info, 816);
554         CHECK_OFFSET(vm_exit_intr_error_code, 820);
555         CHECK_OFFSET(idt_vectoring_info_field, 824);
556         CHECK_OFFSET(idt_vectoring_error_code, 828);
557         CHECK_OFFSET(vm_exit_instruction_len, 832);
558         CHECK_OFFSET(vmx_instruction_info, 836);
559         CHECK_OFFSET(guest_es_limit, 840);
560         CHECK_OFFSET(guest_cs_limit, 844);
561         CHECK_OFFSET(guest_ss_limit, 848);
562         CHECK_OFFSET(guest_ds_limit, 852);
563         CHECK_OFFSET(guest_fs_limit, 856);
564         CHECK_OFFSET(guest_gs_limit, 860);
565         CHECK_OFFSET(guest_ldtr_limit, 864);
566         CHECK_OFFSET(guest_tr_limit, 868);
567         CHECK_OFFSET(guest_gdtr_limit, 872);
568         CHECK_OFFSET(guest_idtr_limit, 876);
569         CHECK_OFFSET(guest_es_ar_bytes, 880);
570         CHECK_OFFSET(guest_cs_ar_bytes, 884);
571         CHECK_OFFSET(guest_ss_ar_bytes, 888);
572         CHECK_OFFSET(guest_ds_ar_bytes, 892);
573         CHECK_OFFSET(guest_fs_ar_bytes, 896);
574         CHECK_OFFSET(guest_gs_ar_bytes, 900);
575         CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
576         CHECK_OFFSET(guest_tr_ar_bytes, 908);
577         CHECK_OFFSET(guest_interruptibility_info, 912);
578         CHECK_OFFSET(guest_activity_state, 916);
579         CHECK_OFFSET(guest_sysenter_cs, 920);
580         CHECK_OFFSET(host_ia32_sysenter_cs, 924);
581         CHECK_OFFSET(vmx_preemption_timer_value, 928);
582         CHECK_OFFSET(virtual_processor_id, 960);
583         CHECK_OFFSET(posted_intr_nv, 962);
584         CHECK_OFFSET(guest_es_selector, 964);
585         CHECK_OFFSET(guest_cs_selector, 966);
586         CHECK_OFFSET(guest_ss_selector, 968);
587         CHECK_OFFSET(guest_ds_selector, 970);
588         CHECK_OFFSET(guest_fs_selector, 972);
589         CHECK_OFFSET(guest_gs_selector, 974);
590         CHECK_OFFSET(guest_ldtr_selector, 976);
591         CHECK_OFFSET(guest_tr_selector, 978);
592         CHECK_OFFSET(guest_intr_status, 980);
593         CHECK_OFFSET(host_es_selector, 982);
594         CHECK_OFFSET(host_cs_selector, 984);
595         CHECK_OFFSET(host_ss_selector, 986);
596         CHECK_OFFSET(host_ds_selector, 988);
597         CHECK_OFFSET(host_fs_selector, 990);
598         CHECK_OFFSET(host_gs_selector, 992);
599         CHECK_OFFSET(host_tr_selector, 994);
600         CHECK_OFFSET(guest_pml_index, 996);
601 }
602
603 /*
604  * VMCS12_REVISION is an arbitrary id that should be changed if the content or
605  * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
606  * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
607  *
608  * IMPORTANT: Changing this value will break save/restore compatibility with
609  * older kvm releases.
610  */
611 #define VMCS12_REVISION 0x11e57ed0
612
613 /*
614  * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
615  * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by the
616  * current implementation, 4K is reserved to avoid future complications.
617  */
618 #define VMCS12_SIZE 0x1000
619
620 /*
621  * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
622  * supported VMCS12 field encoding.
623  */
624 #define VMCS12_MAX_FIELD_INDEX 0x17
625
626 struct nested_vmx_msrs {
627         /*
628          * We only store the "true" versions of the VMX capability MSRs. We
629          * generate the "non-true" versions by setting the must-be-1 bits
630          * according to the SDM.
631          */
632         u32 procbased_ctls_low;
633         u32 procbased_ctls_high;
634         u32 secondary_ctls_low;
635         u32 secondary_ctls_high;
636         u32 pinbased_ctls_low;
637         u32 pinbased_ctls_high;
638         u32 exit_ctls_low;
639         u32 exit_ctls_high;
640         u32 entry_ctls_low;
641         u32 entry_ctls_high;
642         u32 misc_low;
643         u32 misc_high;
644         u32 ept_caps;
645         u32 vpid_caps;
646         u64 basic;
647         u64 cr0_fixed0;
648         u64 cr0_fixed1;
649         u64 cr4_fixed0;
650         u64 cr4_fixed1;
651         u64 vmcs_enum;
652         u64 vmfunc_controls;
653 };
654
655 /*
656  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
657  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
658  */
659 struct nested_vmx {
660         /* Has the level-1 guest done VMXON? */
661         bool vmxon;
662         gpa_t vmxon_ptr;
663         bool pml_full;
664
665         /* The guest-physical address of the current VMCS L1 keeps for L2 */
666         gpa_t current_vmptr;
667         /*
668          * Cache of the guest's VMCS, existing outside of guest memory.
669          * Loaded from guest memory during VMPTRLD. Flushed to guest
670          * memory during VMCLEAR and VMPTRLD.
671          */
672         struct vmcs12 *cached_vmcs12;
673         /*
674          * Cache of the guest's shadow VMCS, existing outside of guest
675          * memory. Loaded from guest memory during VM entry. Flushed
676          * to guest memory during VM exit.
677          */
678         struct vmcs12 *cached_shadow_vmcs12;
679         /*
680          * Indicates whether the shadow VMCS must be updated with the
681          * data held by vmcs12
682          */
683         bool sync_shadow_vmcs;
684         bool dirty_vmcs12;
685
686         bool change_vmcs01_virtual_apic_mode;
687
688         /* L2 must run next, and mustn't decide to exit to L1. */
689         bool nested_run_pending;
690
691         struct loaded_vmcs vmcs02;
692
693         /*
694          * Guest pages referred to in the vmcs02 with host-physical
695          * pointers, so we must keep them pinned while L2 runs.
696          */
697         struct page *apic_access_page;
698         struct page *virtual_apic_page;
699         struct page *pi_desc_page;
700         struct pi_desc *pi_desc;
701         bool pi_pending;
702         u16 posted_intr_nv;
703
704         struct hrtimer preemption_timer;
705         bool preemption_timer_expired;
706
707         /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
708         u64 vmcs01_debugctl;
709
710         u16 vpid02;
711         u16 last_vpid;
712
713         struct nested_vmx_msrs msrs;
714
715         /* SMM related state */
716         struct {
717                 /* in VMX operation on SMM entry? */
718                 bool vmxon;
719                 /* in guest mode on SMM entry? */
720                 bool guest_mode;
721         } smm;
722 };
723
724 #define POSTED_INTR_ON  0
725 #define POSTED_INTR_SN  1
726
727 /* Posted-Interrupt Descriptor */
728 struct pi_desc {
729         u32 pir[8];     /* Posted interrupt requested */
730         union {
731                 struct {
732                                 /* bit 256 - Outstanding Notification */
733                         u16     on      : 1,
734                                 /* bit 257 - Suppress Notification */
735                                 sn      : 1,
736                                 /* bit 271:258 - Reserved */
737                                 rsvd_1  : 14;
738                                 /* bit 279:272 - Notification Vector */
739                         u8      nv;
740                                 /* bit 287:280 - Reserved */
741                         u8      rsvd_2;
742                                 /* bit 319:288 - Notification Destination */
743                         u32     ndst;
744                 };
745                 u64 control;
746         };
747         u32 rsvd[6];
748 } __aligned(64);
749
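/*
 * Accessors for the ON/SN bits and the PIR of the posted-interrupt
 * descriptor; the modifying helpers use atomic bitops.
 */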
750 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
751 {
752         return test_and_set_bit(POSTED_INTR_ON,
753                         (unsigned long *)&pi_desc->control);
754 }
755
756 static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
757 {
758         return test_and_clear_bit(POSTED_INTR_ON,
759                         (unsigned long *)&pi_desc->control);
760 }
761
762 static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
763 {
764         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
765 }
766
767 static inline void pi_clear_sn(struct pi_desc *pi_desc)
768 {
769         return clear_bit(POSTED_INTR_SN,
770                         (unsigned long *)&pi_desc->control);
771 }
772
773 static inline void pi_set_sn(struct pi_desc *pi_desc)
774 {
775         return set_bit(POSTED_INTR_SN,
776                         (unsigned long *)&pi_desc->control);
777 }
778
779 static inline void pi_clear_on(struct pi_desc *pi_desc)
780 {
781         clear_bit(POSTED_INTR_ON,
782                   (unsigned long *)&pi_desc->control);
783 }
784
785 static inline int pi_test_on(struct pi_desc *pi_desc)
786 {
787         return test_bit(POSTED_INTR_ON,
788                         (unsigned long *)&pi_desc->control);
789 }
790
791 static inline int pi_test_sn(struct pi_desc *pi_desc)
792 {
793         return test_bit(POSTED_INTR_SN,
794                         (unsigned long *)&pi_desc->control);
795 }
796
797 struct vcpu_vmx {
798         struct kvm_vcpu       vcpu;
799         unsigned long         host_rsp;
800         u8                    fail;
801         u8                    msr_bitmap_mode;
802         u32                   exit_intr_info;
803         u32                   idt_vectoring_info;
804         ulong                 rflags;
805         struct shared_msr_entry *guest_msrs;
806         int                   nmsrs;
807         int                   save_nmsrs;
808         unsigned long         host_idt_base;
809 #ifdef CONFIG_X86_64
810         u64                   msr_host_kernel_gs_base;
811         u64                   msr_guest_kernel_gs_base;
812 #endif
813
814         u64                   arch_capabilities;
815         u64                   spec_ctrl;
816
817         u32 vm_entry_controls_shadow;
818         u32 vm_exit_controls_shadow;
819         u32 secondary_exec_control;
820
821         /*
822          * loaded_vmcs points to the VMCS currently used in this vcpu. For a
823          * non-nested (L1) guest, it always points to vmcs01. For a nested
824          * guest (L2), it points to a different VMCS.  loaded_cpu_state points
825          * to the VMCS whose state is loaded into the CPU registers that only
826          * need to be switched when transitioning to/from the kernel; a NULL
827          * value indicates that host state is loaded.
828          */
829         struct loaded_vmcs    vmcs01;
830         struct loaded_vmcs   *loaded_vmcs;
831         struct loaded_vmcs   *loaded_cpu_state;
832         bool                  __launched; /* temporary, used in vmx_vcpu_run */
833         struct msr_autoload {
834                 unsigned nr;
835                 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
836                 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
837         } msr_autoload;
838
839         struct {
840                 int vm86_active;
841                 ulong save_rflags;
842                 struct kvm_segment segs[8];
843         } rmode;
844         struct {
845                 u32 bitmask; /* 4 bits per segment (1 bit per field) */
846                 struct kvm_save_segment {
847                         u16 selector;
848                         unsigned long base;
849                         u32 limit;
850                         u32 ar;
851                 } seg[8];
852         } segment_cache;
853         int vpid;
854         bool emulation_required;
855
856         u32 exit_reason;
857
858         /* Posted interrupt descriptor */
859         struct pi_desc pi_desc;
860
861         /* Support for a guest hypervisor (nested VMX) */
862         struct nested_vmx nested;
863
864         /* Dynamic PLE window. */
865         int ple_window;
866         bool ple_window_dirty;
867
868         /* Support for PML */
869 #define PML_ENTITY_NUM          512
870         struct page *pml_pg;
871
872         /* apic deadline value in host tsc */
873         u64 hv_deadline_tsc;
874
875         u64 current_tsc_ratio;
876
877         u32 host_pkru;
878
879         unsigned long host_debugctlmsr;
880
881         /*
882          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
883          * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
884          * in msr_ia32_feature_control_valid_bits.
885          */
886         u64 msr_ia32_feature_control;
887         u64 msr_ia32_feature_control_valid_bits;
888         u64 ept_pointer;
889 };
890
891 enum segment_cache_field {
892         SEG_FIELD_SEL = 0,
893         SEG_FIELD_BASE = 1,
894         SEG_FIELD_LIMIT = 2,
895         SEG_FIELD_AR = 3,
896
897         SEG_FIELD_NR = 4
898 };
899
900 static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
901 {
902         return container_of(kvm, struct kvm_vmx, kvm);
903 }
904
905 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
906 {
907         return container_of(vcpu, struct vcpu_vmx, vcpu);
908 }
909
910 static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
911 {
912         return &(to_vmx(vcpu)->pi_desc);
913 }
914
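/*
 * The offset table below is indexed by the VMCS field encoding rotated
 * left by 6 bits (ROL16), which compacts the valid encodings into a much
 * smaller index range; FIELD64 additionally maps a 64-bit field's *_HIGH
 * encoding to the upper 32 bits of the same vmcs12 member.
 */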
915 #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
916 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
917 #define FIELD(number, name)     [ROL16(number, 6)] = VMCS12_OFFSET(name)
918 #define FIELD64(number, name)                                           \
919         FIELD(number, name),                                            \
920         [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
921
922
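/* VMCS fields mirrored in the shadow VMCS, generated from vmx_shadow_fields.h. */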
923 static u16 shadow_read_only_fields[] = {
924 #define SHADOW_FIELD_RO(x) x,
925 #include "vmx_shadow_fields.h"
926 };
927 static int max_shadow_read_only_fields =
928         ARRAY_SIZE(shadow_read_only_fields);
929
930 static u16 shadow_read_write_fields[] = {
931 #define SHADOW_FIELD_RW(x) x,
932 #include "vmx_shadow_fields.h"
933 };
934 static int max_shadow_read_write_fields =
935         ARRAY_SIZE(shadow_read_write_fields);
936
937 static const unsigned short vmcs_field_to_offset_table[] = {
938         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
939         FIELD(POSTED_INTR_NV, posted_intr_nv),
940         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
941         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
942         FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
943         FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
944         FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
945         FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
946         FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
947         FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
948         FIELD(GUEST_INTR_STATUS, guest_intr_status),
949         FIELD(GUEST_PML_INDEX, guest_pml_index),
950         FIELD(HOST_ES_SELECTOR, host_es_selector),
951         FIELD(HOST_CS_SELECTOR, host_cs_selector),
952         FIELD(HOST_SS_SELECTOR, host_ss_selector),
953         FIELD(HOST_DS_SELECTOR, host_ds_selector),
954         FIELD(HOST_FS_SELECTOR, host_fs_selector),
955         FIELD(HOST_GS_SELECTOR, host_gs_selector),
956         FIELD(HOST_TR_SELECTOR, host_tr_selector),
957         FIELD64(IO_BITMAP_A, io_bitmap_a),
958         FIELD64(IO_BITMAP_B, io_bitmap_b),
959         FIELD64(MSR_BITMAP, msr_bitmap),
960         FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
961         FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
962         FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
963         FIELD64(PML_ADDRESS, pml_address),
964         FIELD64(TSC_OFFSET, tsc_offset),
965         FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
966         FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
967         FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
968         FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
969         FIELD64(EPT_POINTER, ept_pointer),
970         FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
971         FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
972         FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
973         FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
974         FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
975         FIELD64(VMREAD_BITMAP, vmread_bitmap),
976         FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
977         FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
978         FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
979         FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
980         FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
981         FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
982         FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
983         FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
984         FIELD64(GUEST_PDPTR0, guest_pdptr0),
985         FIELD64(GUEST_PDPTR1, guest_pdptr1),
986         FIELD64(GUEST_PDPTR2, guest_pdptr2),
987         FIELD64(GUEST_PDPTR3, guest_pdptr3),
988         FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
989         FIELD64(HOST_IA32_PAT, host_ia32_pat),
990         FIELD64(HOST_IA32_EFER, host_ia32_efer),
991         FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
992         FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
993         FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
994         FIELD(EXCEPTION_BITMAP, exception_bitmap),
995         FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
996         FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
997         FIELD(CR3_TARGET_COUNT, cr3_target_count),
998         FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
999         FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
1000         FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
1001         FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
1002         FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
1003         FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
1004         FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
1005         FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
1006         FIELD(TPR_THRESHOLD, tpr_threshold),
1007         FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
1008         FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
1009         FIELD(VM_EXIT_REASON, vm_exit_reason),
1010         FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
1011         FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
1012         FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
1013         FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
1014         FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
1015         FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
1016         FIELD(GUEST_ES_LIMIT, guest_es_limit),
1017         FIELD(GUEST_CS_LIMIT, guest_cs_limit),
1018         FIELD(GUEST_SS_LIMIT, guest_ss_limit),
1019         FIELD(GUEST_DS_LIMIT, guest_ds_limit),
1020         FIELD(GUEST_FS_LIMIT, guest_fs_limit),
1021         FIELD(GUEST_GS_LIMIT, guest_gs_limit),
1022         FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
1023         FIELD(GUEST_TR_LIMIT, guest_tr_limit),
1024         FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
1025         FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
1026         FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
1027         FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
1028         FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
1029         FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
1030         FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
1031         FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
1032         FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
1033         FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
1034         FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
1035         FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
1036         FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
1037         FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
1038         FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
1039         FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
1040         FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
1041         FIELD(CR0_READ_SHADOW, cr0_read_shadow),
1042         FIELD(CR4_READ_SHADOW, cr4_read_shadow),
1043         FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
1044         FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
1045         FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
1046         FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
1047         FIELD(EXIT_QUALIFICATION, exit_qualification),
1048         FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
1049         FIELD(GUEST_CR0, guest_cr0),
1050         FIELD(GUEST_CR3, guest_cr3),
1051         FIELD(GUEST_CR4, guest_cr4),
1052         FIELD(GUEST_ES_BASE, guest_es_base),
1053         FIELD(GUEST_CS_BASE, guest_cs_base),
1054         FIELD(GUEST_SS_BASE, guest_ss_base),
1055         FIELD(GUEST_DS_BASE, guest_ds_base),
1056         FIELD(GUEST_FS_BASE, guest_fs_base),
1057         FIELD(GUEST_GS_BASE, guest_gs_base),
1058         FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
1059         FIELD(GUEST_TR_BASE, guest_tr_base),
1060         FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
1061         FIELD(GUEST_IDTR_BASE, guest_idtr_base),
1062         FIELD(GUEST_DR7, guest_dr7),
1063         FIELD(GUEST_RSP, guest_rsp),
1064         FIELD(GUEST_RIP, guest_rip),
1065         FIELD(GUEST_RFLAGS, guest_rflags),
1066         FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
1067         FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
1068         FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
1069         FIELD(HOST_CR0, host_cr0),
1070         FIELD(HOST_CR3, host_cr3),
1071         FIELD(HOST_CR4, host_cr4),
1072         FIELD(HOST_FS_BASE, host_fs_base),
1073         FIELD(HOST_GS_BASE, host_gs_base),
1074         FIELD(HOST_TR_BASE, host_tr_base),
1075         FIELD(HOST_GDTR_BASE, host_gdtr_base),
1076         FIELD(HOST_IDTR_BASE, host_idtr_base),
1077         FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
1078         FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
1079         FIELD(HOST_RSP, host_rsp),
1080         FIELD(HOST_RIP, host_rip),
1081 };
1082
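/*
 * Translate a VMCS field encoding into its byte offset within struct vmcs12,
 * or -ENOENT if the field is not supported.  array_index_nospec() prevents
 * speculative out-of-bounds reads of the offset table.
 */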
1083 static inline short vmcs_field_to_offset(unsigned long field)
1084 {
1085         const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
1086         unsigned short offset;
1087         unsigned index;
1088
1089         if (field >> 15)
1090                 return -ENOENT;
1091
1092         index = ROL16(field, 6);
1093         if (index >= size)
1094                 return -ENOENT;
1095
1096         index = array_index_nospec(index, size);
1097         offset = vmcs_field_to_offset_table[index];
1098         if (offset == 0)
1099                 return -ENOENT;
1100         return offset;
1101 }
1102
1103 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
1104 {
1105         return to_vmx(vcpu)->nested.cached_vmcs12;
1106 }
1107
1108 static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
1109 {
1110         return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
1111 }
1112
1113 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
1114 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
1115 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
1116 static bool vmx_xsaves_supported(void);
1117 static void vmx_set_segment(struct kvm_vcpu *vcpu,
1118                             struct kvm_segment *var, int seg);
1119 static void vmx_get_segment(struct kvm_vcpu *vcpu,
1120                             struct kvm_segment *var, int seg);
1121 static bool guest_state_valid(struct kvm_vcpu *vcpu);
1122 static u32 vmx_segment_access_rights(struct kvm_segment *var);
1123 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
1124 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
1125 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
1126 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
1127                                             u16 error_code);
1128 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
1129 static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1130                                                           u32 msr, int type);
1131
1132 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
1133 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
1134 /*
1135  * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
1136  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
1137  */
1138 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
1139
1140 /*
1141  * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
1142  * can find which vCPU should be woken up.
1143  */
1144 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
1145 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
1146
1147 enum {
1148         VMX_VMREAD_BITMAP,
1149         VMX_VMWRITE_BITMAP,
1150         VMX_BITMAP_NR
1151 };
1152
1153 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
1154
1155 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
1156 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
1157
1158 static bool cpu_has_load_ia32_efer;
1159 static bool cpu_has_load_perf_global_ctrl;
1160
1161 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
1162 static DEFINE_SPINLOCK(vmx_vpid_lock);
1163
1164 static struct vmcs_config {
1165         int size;
1166         int order;
1167         u32 basic_cap;
1168         u32 revision_id;
1169         u32 pin_based_exec_ctrl;
1170         u32 cpu_based_exec_ctrl;
1171         u32 cpu_based_2nd_exec_ctrl;
1172         u32 vmexit_ctrl;
1173         u32 vmentry_ctrl;
1174         struct nested_vmx_msrs nested;
1175 } vmcs_config;
1176
1177 static struct vmx_capability {
1178         u32 ept;
1179         u32 vpid;
1180 } vmx_capability;
1181
1182 #define VMX_SEGMENT_FIELD(seg)                                  \
1183         [VCPU_SREG_##seg] = {                                   \
1184                 .selector = GUEST_##seg##_SELECTOR,             \
1185                 .base = GUEST_##seg##_BASE,                     \
1186                 .limit = GUEST_##seg##_LIMIT,                   \
1187                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
1188         }
1189
1190 static const struct kvm_vmx_segment_field {
1191         unsigned selector;
1192         unsigned base;
1193         unsigned limit;
1194         unsigned ar_bytes;
1195 } kvm_vmx_segment_fields[] = {
1196         VMX_SEGMENT_FIELD(CS),
1197         VMX_SEGMENT_FIELD(DS),
1198         VMX_SEGMENT_FIELD(ES),
1199         VMX_SEGMENT_FIELD(FS),
1200         VMX_SEGMENT_FIELD(GS),
1201         VMX_SEGMENT_FIELD(SS),
1202         VMX_SEGMENT_FIELD(TR),
1203         VMX_SEGMENT_FIELD(LDTR),
1204 };
1205
1206 static u64 host_efer;
1207
1208 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
1209
1210 /*
1211  * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
1212  * away by decrementing the array size.
1213  */
1214 static const u32 vmx_msr_index[] = {
1215 #ifdef CONFIG_X86_64
1216         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1217 #endif
1218         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1219 };
1220
1221 DEFINE_STATIC_KEY_FALSE(enable_evmcs);
1222
1223 #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
1224
1225 #define KVM_EVMCS_VERSION 1
1226
1227 #if IS_ENABLED(CONFIG_HYPERV)
1228 static bool __read_mostly enlightened_vmcs = true;
1229 module_param(enlightened_vmcs, bool, 0444);
1230
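/*
 * Enlightened VMCS accessors: read/write the field directly in the current
 * eVMCS and clear the corresponding hv_clean_fields bit on writes.
 */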
1231 static inline void evmcs_write64(unsigned long field, u64 value)
1232 {
1233         u16 clean_field;
1234         int offset = get_evmcs_offset(field, &clean_field);
1235
1236         if (offset < 0)
1237                 return;
1238
1239         *(u64 *)((char *)current_evmcs + offset) = value;
1240
1241         current_evmcs->hv_clean_fields &= ~clean_field;
1242 }
1243
1244 static inline void evmcs_write32(unsigned long field, u32 value)
1245 {
1246         u16 clean_field;
1247         int offset = get_evmcs_offset(field, &clean_field);
1248
1249         if (offset < 0)
1250                 return;
1251
1252         *(u32 *)((char *)current_evmcs + offset) = value;
1253         current_evmcs->hv_clean_fields &= ~clean_field;
1254 }
1255
1256 static inline void evmcs_write16(unsigned long field, u16 value)
1257 {
1258         u16 clean_field;
1259         int offset = get_evmcs_offset(field, &clean_field);
1260
1261         if (offset < 0)
1262                 return;
1263
1264         *(u16 *)((char *)current_evmcs + offset) = value;
1265         current_evmcs->hv_clean_fields &= ~clean_field;
1266 }
1267
1268 static inline u64 evmcs_read64(unsigned long field)
1269 {
1270         int offset = get_evmcs_offset(field, NULL);
1271
1272         if (offset < 0)
1273                 return 0;
1274
1275         return *(u64 *)((char *)current_evmcs + offset);
1276 }
1277
1278 static inline u32 evmcs_read32(unsigned long field)
1279 {
1280         int offset = get_evmcs_offset(field, NULL);
1281
1282         if (offset < 0)
1283                 return 0;
1284
1285         return *(u32 *)((char *)current_evmcs + offset);
1286 }
1287
1288 static inline u16 evmcs_read16(unsigned long field)
1289 {
1290         int offset = get_evmcs_offset(field, NULL);
1291
1292         if (offset < 0)
1293                 return 0;
1294
1295         return *(u16 *)((char *)current_evmcs + offset);
1296 }
1297
1298 static inline void evmcs_touch_msr_bitmap(void)
1299 {
1300         if (unlikely(!current_evmcs))
1301                 return;
1302
1303         if (current_evmcs->hv_enlightenments_control.msr_bitmap)
1304                 current_evmcs->hv_clean_fields &=
1305                         ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
1306 }
1307
1308 static void evmcs_load(u64 phys_addr)
1309 {
1310         struct hv_vp_assist_page *vp_ap =
1311                 hv_get_vp_assist_page(smp_processor_id());
1312
1313         vp_ap->current_nested_vmcs = phys_addr;
1314         vp_ap->enlighten_vmentry = 1;
1315 }
1316
1317 static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
1318 {
1319         /*
1320          * Enlightened VMCSv1 doesn't support these:
1321          *
1322          *      POSTED_INTR_NV                  = 0x00000002,
1323          *      GUEST_INTR_STATUS               = 0x00000810,
1324          *      APIC_ACCESS_ADDR                = 0x00002014,
1325          *      POSTED_INTR_DESC_ADDR           = 0x00002016,
1326          *      EOI_EXIT_BITMAP0                = 0x0000201c,
1327          *      EOI_EXIT_BITMAP1                = 0x0000201e,
1328          *      EOI_EXIT_BITMAP2                = 0x00002020,
1329          *      EOI_EXIT_BITMAP3                = 0x00002022,
1330          */
1331         vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
1332         vmcs_conf->cpu_based_2nd_exec_ctrl &=
1333                 ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1334         vmcs_conf->cpu_based_2nd_exec_ctrl &=
1335                 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1336         vmcs_conf->cpu_based_2nd_exec_ctrl &=
1337                 ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
1338
1339         /*
1340          *      GUEST_PML_INDEX                 = 0x00000812,
1341          *      PML_ADDRESS                     = 0x0000200e,
1342          */
1343         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
1344
1345         /*      VM_FUNCTION_CONTROL             = 0x00002018, */
1346         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
1347
1348         /*
1349          *      EPTP_LIST_ADDRESS               = 0x00002024,
1350          *      VMREAD_BITMAP                   = 0x00002026,
1351          *      VMWRITE_BITMAP                  = 0x00002028,
1352          */
1353         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
1354
1355         /*
1356          *      TSC_MULTIPLIER                  = 0x00002032,
1357          */
1358         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
1359
1360         /*
1361          *      PLE_GAP                         = 0x00004020,
1362          *      PLE_WINDOW                      = 0x00004022,
1363          */
1364         vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1365
1366         /*
1367          *      VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
1368          */
1369         vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
1370
1371         /*
1372          *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
1373          *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
1374          */
1375         vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
1376         vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
1377
1378         /*
1379          * Currently unsupported in KVM:
1380          *      GUEST_IA32_RTIT_CTL             = 0x00002814,
1381          */
1382 }
1383
1384 /* check_ept_pointer_match() should be called under protection of ept_pointer_lock. */
1385 static void check_ept_pointer_match(struct kvm *kvm)
1386 {
1387         struct kvm_vcpu *vcpu;
1388         u64 tmp_eptp = INVALID_PAGE;
1389         int i;
1390
1391         kvm_for_each_vcpu(i, vcpu, kvm) {
1392                 if (!VALID_PAGE(tmp_eptp)) {
1393                         tmp_eptp = to_vmx(vcpu)->ept_pointer;
1394                 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
1395                         to_kvm_vmx(kvm)->ept_pointers_match
1396                                 = EPT_POINTERS_MISMATCH;
1397                         return;
1398                 }
1399         }
1400
1401         to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
1402 }
1403
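/*
 * Remote TLB flush via Hyper-V's guest-mapping flush hypercall; only usable
 * when every vCPU of the VM uses the same EPT pointer, otherwise -ENOTSUPP
 * is returned.
 */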
1404 static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
1405 {
1406         int ret;
1407
1408         spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1409
1410         if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
1411                 check_ept_pointer_match(kvm);
1412
1413         if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
1414                 ret = -ENOTSUPP;
1415                 goto out;
1416         }
1417
1418         ret = hyperv_flush_guest_mapping(
1419                         to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
1420
1421 out:
1422         spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1423         return ret;
1424 }
1425 #else /* !IS_ENABLED(CONFIG_HYPERV) */
1426 static inline void evmcs_write64(unsigned long field, u64 value) {}
1427 static inline void evmcs_write32(unsigned long field, u32 value) {}
1428 static inline void evmcs_write16(unsigned long field, u16 value) {}
1429 static inline u64 evmcs_read64(unsigned long field) { return 0; }
1430 static inline u32 evmcs_read32(unsigned long field) { return 0; }
1431 static inline u16 evmcs_read16(unsigned long field) { return 0; }
1432 static inline void evmcs_load(u64 phys_addr) {}
1433 static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
1434 static inline void evmcs_touch_msr_bitmap(void) {}
1435 #endif /* IS_ENABLED(CONFIG_HYPERV) */
1436
1437 static inline bool is_exception_n(u32 intr_info, u8 vector)
1438 {
1439         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1440                              INTR_INFO_VALID_MASK)) ==
1441                 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1442 }
1443
1444 static inline bool is_debug(u32 intr_info)
1445 {
1446         return is_exception_n(intr_info, DB_VECTOR);
1447 }
1448
1449 static inline bool is_breakpoint(u32 intr_info)
1450 {
1451         return is_exception_n(intr_info, BP_VECTOR);
1452 }
1453
1454 static inline bool is_page_fault(u32 intr_info)
1455 {
1456         return is_exception_n(intr_info, PF_VECTOR);
1457 }
1458
1459 static inline bool is_no_device(u32 intr_info)
1460 {
1461         return is_exception_n(intr_info, NM_VECTOR);
1462 }
1463
1464 static inline bool is_invalid_opcode(u32 intr_info)
1465 {
1466         return is_exception_n(intr_info, UD_VECTOR);
1467 }
1468
1469 static inline bool is_gp_fault(u32 intr_info)
1470 {
1471         return is_exception_n(intr_info, GP_VECTOR);
1472 }
1473
1474 static inline bool is_external_interrupt(u32 intr_info)
1475 {
1476         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1477                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1478 }
1479
1480 static inline bool is_machine_check(u32 intr_info)
1481 {
1482         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1483                              INTR_INFO_VALID_MASK)) ==
1484                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1485 }
1486
1487 /* Undocumented: icebp/int1 */
1488 static inline bool is_icebp(u32 intr_info)
1489 {
1490         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1491                 == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
1492 }
1493
1494 static inline bool cpu_has_vmx_msr_bitmap(void)
1495 {
1496         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
1497 }
1498
1499 static inline bool cpu_has_vmx_tpr_shadow(void)
1500 {
1501         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
1502 }
1503
1504 static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
1505 {
1506         return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
1507 }
1508
1509 static inline bool cpu_has_secondary_exec_ctrls(void)
1510 {
1511         return vmcs_config.cpu_based_exec_ctrl &
1512                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1513 }
1514
1515 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
1516 {
1517         return vmcs_config.cpu_based_2nd_exec_ctrl &
1518                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1519 }
1520
1521 static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1522 {
1523         return vmcs_config.cpu_based_2nd_exec_ctrl &
1524                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1525 }
1526
1527 static inline bool cpu_has_vmx_apic_register_virt(void)
1528 {
1529         return vmcs_config.cpu_based_2nd_exec_ctrl &
1530                 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1531 }
1532
1533 static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1534 {
1535         return vmcs_config.cpu_based_2nd_exec_ctrl &
1536                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1537 }
1538
1539 /*
1540  * Comment format: document - errata name - stepping - processor name.
1541  * Taken from
1542  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1543  */
1544 static u32 vmx_preemption_cpu_tfms[] = {
1545 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
1546 0x000206E6,
1547 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
1548 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1549 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1550 0x00020652,
1551 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1552 0x00020655,
1553 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
1554 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
1555 /*
1556  * 320767.pdf - AAP86  - B1 -
1557  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1558  */
1559 0x000106E5,
1560 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1561 0x000106A0,
1562 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1563 0x000106A1,
1564 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1565 0x000106A4,
1566 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1567 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1568 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1569 0x000106A5,
1570 };
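
/*
 * The values above are raw CPUID.1:EAX family/model/stepping words.
 * cpu_has_broken_vmx_preemption_timer() below compares the host's
 * CPUID.1:EAX against them after clearing the reserved bits 15:14 and
 * 31:28, i.e. eax &= ~(0x3U << 14 | 0xfU << 28).  As a worked example,
 * 0x000206E6 decodes to stepping 6, model 0xE, family 6, extended
 * model 2 (effective model 0x2E), matching the D0 Xeon 7500 entry above.
 */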
1571
1572 static inline bool cpu_has_broken_vmx_preemption_timer(void)
1573 {
1574         u32 eax = cpuid_eax(0x00000001), i;
1575
1576         /* Clear the reserved bits */
1577         eax &= ~(0x3U << 14 | 0xfU << 28);
1578         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1579                 if (eax == vmx_preemption_cpu_tfms[i])
1580                         return true;
1581
1582         return false;
1583 }
1584
1585 static inline bool cpu_has_vmx_preemption_timer(void)
1586 {
1587         return vmcs_config.pin_based_exec_ctrl &
1588                 PIN_BASED_VMX_PREEMPTION_TIMER;
1589 }
1590
1591 static inline bool cpu_has_vmx_posted_intr(void)
1592 {
1593         return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1594                 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
1595 }
1596
1597 static inline bool cpu_has_vmx_apicv(void)
1598 {
1599         return cpu_has_vmx_apic_register_virt() &&
1600                 cpu_has_vmx_virtual_intr_delivery() &&
1601                 cpu_has_vmx_posted_intr();
1602 }
1603
1604 static inline bool cpu_has_vmx_flexpriority(void)
1605 {
1606         return cpu_has_vmx_tpr_shadow() &&
1607                 cpu_has_vmx_virtualize_apic_accesses();
1608 }
1609
1610 static inline bool cpu_has_vmx_ept_execute_only(void)
1611 {
1612         return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1613 }
1614
1615 static inline bool cpu_has_vmx_ept_2m_page(void)
1616 {
1617         return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1618 }
1619
1620 static inline bool cpu_has_vmx_ept_1g_page(void)
1621 {
1622         return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1623 }
1624
1625 static inline bool cpu_has_vmx_ept_4levels(void)
1626 {
1627         return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1628 }
1629
1630 static inline bool cpu_has_vmx_ept_mt_wb(void)
1631 {
1632         return vmx_capability.ept & VMX_EPTP_WB_BIT;
1633 }
1634
1635 static inline bool cpu_has_vmx_ept_5levels(void)
1636 {
1637         return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1638 }
1639
1640 static inline bool cpu_has_vmx_ept_ad_bits(void)
1641 {
1642         return vmx_capability.ept & VMX_EPT_AD_BIT;
1643 }
1644
1645 static inline bool cpu_has_vmx_invept_context(void)
1646 {
1647         return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1648 }
1649
1650 static inline bool cpu_has_vmx_invept_global(void)
1651 {
1652         return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1653 }
1654
1655 static inline bool cpu_has_vmx_invvpid_individual_addr(void)
1656 {
1657         return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
1658 }
1659
1660 static inline bool cpu_has_vmx_invvpid_single(void)
1661 {
1662         return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1663 }
1664
1665 static inline bool cpu_has_vmx_invvpid_global(void)
1666 {
1667         return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1668 }
1669
1670 static inline bool cpu_has_vmx_invvpid(void)
1671 {
1672         return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1673 }
1674
1675 static inline bool cpu_has_vmx_ept(void)
1676 {
1677         return vmcs_config.cpu_based_2nd_exec_ctrl &
1678                 SECONDARY_EXEC_ENABLE_EPT;
1679 }
1680
1681 static inline bool cpu_has_vmx_unrestricted_guest(void)
1682 {
1683         return vmcs_config.cpu_based_2nd_exec_ctrl &
1684                 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1685 }
1686
1687 static inline bool cpu_has_vmx_ple(void)
1688 {
1689         return vmcs_config.cpu_based_2nd_exec_ctrl &
1690                 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1691 }
1692
1693 static inline bool cpu_has_vmx_basic_inout(void)
1694 {
1695         return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1696 }
1697
1698 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1699 {
1700         return flexpriority_enabled && lapic_in_kernel(vcpu);
1701 }
1702
1703 static inline bool cpu_has_vmx_vpid(void)
1704 {
1705         return vmcs_config.cpu_based_2nd_exec_ctrl &
1706                 SECONDARY_EXEC_ENABLE_VPID;
1707 }
1708
1709 static inline bool cpu_has_vmx_rdtscp(void)
1710 {
1711         return vmcs_config.cpu_based_2nd_exec_ctrl &
1712                 SECONDARY_EXEC_RDTSCP;
1713 }
1714
1715 static inline bool cpu_has_vmx_invpcid(void)
1716 {
1717         return vmcs_config.cpu_based_2nd_exec_ctrl &
1718                 SECONDARY_EXEC_ENABLE_INVPCID;
1719 }
1720
1721 static inline bool cpu_has_virtual_nmis(void)
1722 {
1723         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1724 }
1725
1726 static inline bool cpu_has_vmx_wbinvd_exit(void)
1727 {
1728         return vmcs_config.cpu_based_2nd_exec_ctrl &
1729                 SECONDARY_EXEC_WBINVD_EXITING;
1730 }
1731
1732 static inline bool cpu_has_vmx_shadow_vmcs(void)
1733 {
1734         u64 vmx_msr;
1735         rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1736         /* check if the cpu supports writing r/o exit information fields */
1737         if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1738                 return false;
1739
1740         return vmcs_config.cpu_based_2nd_exec_ctrl &
1741                 SECONDARY_EXEC_SHADOW_VMCS;
1742 }
1743
1744 static inline bool cpu_has_vmx_pml(void)
1745 {
1746         return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1747 }
1748
1749 static inline bool cpu_has_vmx_tsc_scaling(void)
1750 {
1751         return vmcs_config.cpu_based_2nd_exec_ctrl &
1752                 SECONDARY_EXEC_TSC_SCALING;
1753 }
1754
1755 static inline bool cpu_has_vmx_vmfunc(void)
1756 {
1757         return vmcs_config.cpu_based_2nd_exec_ctrl &
1758                 SECONDARY_EXEC_ENABLE_VMFUNC;
1759 }
1760
1761 static bool vmx_umip_emulated(void)
1762 {
1763         return vmcs_config.cpu_based_2nd_exec_ctrl &
1764                 SECONDARY_EXEC_DESC;
1765 }
1766
1767 static inline bool report_flexpriority(void)
1768 {
1769         return flexpriority_enabled;
1770 }
1771
1772 static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1773 {
1774         return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
1775 }
1776
1777 /*
1778  * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
1779  * to modify any valid field of the VMCS, or are the VM-exit
1780  * information fields read-only?
1781  */
1782 static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
1783 {
1784         return to_vmx(vcpu)->nested.msrs.misc_low &
1785                 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
1786 }
1787
1788 static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
1789 {
1790         return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
1791 }
1792
1793 static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
1794 {
1795         return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
1796                         CPU_BASED_MONITOR_TRAP_FLAG;
1797 }
1798
1799 static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
1800 {
1801         return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
1802                 SECONDARY_EXEC_SHADOW_VMCS;
1803 }
1804
1805 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1806 {
1807         return vmcs12->cpu_based_vm_exec_control & bit;
1808 }
1809
1810 static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1811 {
1812         return (vmcs12->cpu_based_vm_exec_control &
1813                         CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1814                 (vmcs12->secondary_vm_exec_control & bit);
1815 }
1816
1817 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1818 {
1819         return vmcs12->pin_based_vm_exec_control &
1820                 PIN_BASED_VMX_PREEMPTION_TIMER;
1821 }
1822
1823 static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
1824 {
1825         return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
1826 }
1827
1828 static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1829 {
1830         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1831 }
1832
1833 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1834 {
1835         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1836 }
1837
1838 static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1839 {
1840         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
1841 }
1842
1843 static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
1844 {
1845         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
1846 }
1847
1848 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1849 {
1850         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1851 }
1852
1853 static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
1854 {
1855         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
1856 }
1857
1858 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1859 {
1860         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1861 }
1862
1863 static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1864 {
1865         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1866 }
1867
1868 static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1869 {
1870         return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1871 }
1872
1873 static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
1874 {
1875         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
1876 }
1877
1878 static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
1879 {
1880         return nested_cpu_has_vmfunc(vmcs12) &&
1881                 (vmcs12->vm_function_control &
1882                  VMX_VMFUNC_EPTP_SWITCHING);
1883 }
1884
1885 static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
1886 {
1887         return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
1888 }
1889
1890 static inline bool is_nmi(u32 intr_info)
1891 {
1892         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1893                 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
1894 }
1895
1896 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1897                               u32 exit_intr_info,
1898                               unsigned long exit_qualification);
1899 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1900                         struct vmcs12 *vmcs12,
1901                         u32 reason, unsigned long qualification);
1902
1903 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1904 {
1905         int i;
1906
1907         for (i = 0; i < vmx->nmsrs; ++i)
1908                 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1909                         return i;
1910         return -1;
1911 }
1912
1913 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1914 {
1915         struct {
1916                 u64 vpid : 16;
1917                 u64 rsvd : 48;
1918                 u64 gva;
1919         } operand = { vpid, 0, gva };
1920         bool error;
1921
1922         asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
1923                       : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
1924                       : "memory");
1925         BUG_ON(error);
1926 }
1927
1928 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1929 {
1930         struct {
1931                 u64 eptp, gpa;
1932         } operand = {eptp, gpa};
1933         bool error;
1934
1935         asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
1936                       : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
1937                       : "memory");
1938         BUG_ON(error);
1939 }
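
/*
 * Both wrappers above follow the same pattern: the invalidation type goes
 * in a register ("c"(ext)) and the address of a 128-bit descriptor goes in
 * another ("a"(&operand)).  The descriptor layouts mirror the on-stack
 * structs: for INVVPID, bits 15:0 hold the VPID, bits 63:16 are reserved
 * and bits 127:64 hold the linear address (used only by the
 * individual-address type); for INVEPT, bits 63:0 hold the EPTP and
 * bits 127:64 are passed as zero by the callers in this file.
 */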
1940
1941 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1942 {
1943         int i;
1944
1945         i = __find_msr_index(vmx, msr);
1946         if (i >= 0)
1947                 return &vmx->guest_msrs[i];
1948         return NULL;
1949 }
1950
1951 static void vmcs_clear(struct vmcs *vmcs)
1952 {
1953         u64 phys_addr = __pa(vmcs);
1954         bool error;
1955
1956         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
1957                       : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
1958                       : "memory");
1959         if (unlikely(error))
1960                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1961                        vmcs, phys_addr);
1962 }
1963
1964 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1965 {
1966         vmcs_clear(loaded_vmcs->vmcs);
1967         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
1968                 vmcs_clear(loaded_vmcs->shadow_vmcs);
1969         loaded_vmcs->cpu = -1;
1970         loaded_vmcs->launched = 0;
1971 }
1972
1973 static void vmcs_load(struct vmcs *vmcs)
1974 {
1975         u64 phys_addr = __pa(vmcs);
1976         bool error;
1977
1978         if (static_branch_unlikely(&enable_evmcs))
1979                 return evmcs_load(phys_addr);
1980
1981         asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
1982                       : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
1983                       : "memory");
1984         if (unlikely(error))
1985                 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1986                        vmcs, phys_addr);
1987 }
1988
1989 #ifdef CONFIG_KEXEC_CORE
1990 /*
1991  * This bitmap indicates, per CPU, whether the crash-time vmclear
1992  * operation is enabled.  It is disabled for all CPUs by
1993  * default.
1994  */
1995 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1996
1997 static inline void crash_enable_local_vmclear(int cpu)
1998 {
1999         cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
2000 }
2001
2002 static inline void crash_disable_local_vmclear(int cpu)
2003 {
2004         cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
2005 }
2006
2007 static inline int crash_local_vmclear_enabled(int cpu)
2008 {
2009         return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
2010 }
2011
2012 static void crash_vmclear_local_loaded_vmcss(void)
2013 {
2014         int cpu = raw_smp_processor_id();
2015         struct loaded_vmcs *v;
2016
2017         if (!crash_local_vmclear_enabled(cpu))
2018                 return;
2019
2020         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
2021                             loaded_vmcss_on_cpu_link)
2022                 vmcs_clear(v->vmcs);
2023 }
2024 #else
2025 static inline void crash_enable_local_vmclear(int cpu) { }
2026 static inline void crash_disable_local_vmclear(int cpu) { }
2027 #endif /* CONFIG_KEXEC_CORE */
2028
2029 static void __loaded_vmcs_clear(void *arg)
2030 {
2031         struct loaded_vmcs *loaded_vmcs = arg;
2032         int cpu = raw_smp_processor_id();
2033
2034         if (loaded_vmcs->cpu != cpu)
2035                 return; /* vcpu migration can race with cpu offline */
2036         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
2037                 per_cpu(current_vmcs, cpu) = NULL;
2038         crash_disable_local_vmclear(cpu);
2039         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
2040
2041         /*
2042          * Ensure the deletion of loaded_vmcs from the per-cpu list above
2043          * is visible before loaded_vmcs->cpu is set to -1 in
2044          * loaded_vmcs_init().  Otherwise another CPU could observe cpu == -1
2045          * first and add the vmcs to its per-cpu list before it is deleted.
2046          */
2047         smp_wmb();
2048
2049         loaded_vmcs_init(loaded_vmcs);
2050         crash_enable_local_vmclear(cpu);
2051 }
2052
2053 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
2054 {
2055         int cpu = loaded_vmcs->cpu;
2056
2057         if (cpu != -1)
2058                 smp_call_function_single(cpu,
2059                          __loaded_vmcs_clear, loaded_vmcs, 1);
2060 }
2061
2062 static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
2063 {
2064         if (vpid == 0)
2065                 return true;
2066
2067         if (cpu_has_vmx_invvpid_individual_addr()) {
2068                 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
2069                 return true;
2070         }
2071
2072         return false;
2073 }
2074
2075 static inline void vpid_sync_vcpu_single(int vpid)
2076 {
2077         if (vpid == 0)
2078                 return;
2079
2080         if (cpu_has_vmx_invvpid_single())
2081                 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
2082 }
2083
2084 static inline void vpid_sync_vcpu_global(void)
2085 {
2086         if (cpu_has_vmx_invvpid_global())
2087                 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
2088 }
2089
2090 static inline void vpid_sync_context(int vpid)
2091 {
2092         if (cpu_has_vmx_invvpid_single())
2093                 vpid_sync_vcpu_single(vpid);
2094         else
2095                 vpid_sync_vcpu_global();
2096 }
2097
2098 static inline void ept_sync_global(void)
2099 {
2100         __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
2101 }
2102
2103 static inline void ept_sync_context(u64 eptp)
2104 {
2105         if (cpu_has_vmx_invept_context())
2106                 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
2107         else
2108                 ept_sync_global();
2109 }
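
/*
 * The sync helpers above pick the finest invalidation scope the hardware
 * advertises and fall back to a coarser one otherwise:
 * vpid_sync_vcpu_addr() attempts a single-address INVVPID and returns
 * false so the caller can widen the flush, vpid_sync_context() falls back
 * from single-context to global INVVPID, and ept_sync_context() falls
 * back from single-context to global INVEPT.  A vpid of 0 means VPID
 * tagging is not in use for the guest, so the VPID helpers skip the
 * invalidation entirely.
 */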
2110
2111 static __always_inline void vmcs_check16(unsigned long field)
2112 {
2113         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2114                          "16-bit accessor invalid for 64-bit field");
2115         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2116                          "16-bit accessor invalid for 64-bit high field");
2117         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2118                          "16-bit accessor invalid for 32-bit high field");
2119         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2120                          "16-bit accessor invalid for natural width field");
2121 }
2122
2123 static __always_inline void vmcs_check32(unsigned long field)
2124 {
2125         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2126                          "32-bit accessor invalid for 16-bit field");
2127         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2128                          "32-bit accessor invalid for natural width field");
2129 }
2130
2131 static __always_inline void vmcs_check64(unsigned long field)
2132 {
2133         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2134                          "64-bit accessor invalid for 16-bit field");
2135         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2136                          "64-bit accessor invalid for 64-bit high field");
2137         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2138                          "64-bit accessor invalid for 32-bit field");
2139         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2140                          "64-bit accessor invalid for natural width field");
2141 }
2142
2143 static __always_inline void vmcs_checkl(unsigned long field)
2144 {
2145         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2146                          "Natural width accessor invalid for 16-bit field");
2147         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2148                          "Natural width accessor invalid for 64-bit field");
2149         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2150                          "Natural width accessor invalid for 64-bit high field");
2151         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2152                          "Natural width accessor invalid for 32-bit field");
2153 }
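
/*
 * The checks above rely on the VMCS field encoding: bits 14:13 of the
 * field select its width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit,
 * 3 = natural width) and bit 0 is the access type, which for 64-bit
 * fields selects the high 32 bits.  Hence (field & 0x6000) extracts the
 * width and (field & 0x6001) additionally distinguishes a 64-bit field
 * from its high-dword alias.  For example, GUEST_PML_INDEX (0x812) has
 * bits 14:13 clear and is therefore a 16-bit field, while PML_ADDRESS
 * (0x200e) has width 1 and is a 64-bit field.
 */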
2154
2155 static __always_inline unsigned long __vmcs_readl(unsigned long field)
2156 {
2157         unsigned long value;
2158
2159         asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
2160                       : "=a"(value) : "d"(field) : "cc");
2161         return value;
2162 }
2163
2164 static __always_inline u16 vmcs_read16(unsigned long field)
2165 {
2166         vmcs_check16(field);
2167         if (static_branch_unlikely(&enable_evmcs))
2168                 return evmcs_read16(field);
2169         return __vmcs_readl(field);
2170 }
2171
2172 static __always_inline u32 vmcs_read32(unsigned long field)
2173 {
2174         vmcs_check32(field);
2175         if (static_branch_unlikely(&enable_evmcs))
2176                 return evmcs_read32(field);
2177         return __vmcs_readl(field);
2178 }
2179
2180 static __always_inline u64 vmcs_read64(unsigned long field)
2181 {
2182         vmcs_check64(field);
2183         if (static_branch_unlikely(&enable_evmcs))
2184                 return evmcs_read64(field);
2185 #ifdef CONFIG_X86_64
2186         return __vmcs_readl(field);
2187 #else
2188         return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
2189 #endif
2190 }
2191
2192 static __always_inline unsigned long vmcs_readl(unsigned long field)
2193 {
2194         vmcs_checkl(field);
2195         if (static_branch_unlikely(&enable_evmcs))
2196                 return evmcs_read64(field);
2197         return __vmcs_readl(field);
2198 }
2199
2200 static noinline void vmwrite_error(unsigned long field, unsigned long value)
2201 {
2202         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
2203                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
2204         dump_stack();
2205 }
2206
2207 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
2208 {
2209         bool error;
2210
2211         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
2212                       : CC_OUT(na) (error) : "a"(value), "d"(field));
2213         if (unlikely(error))
2214                 vmwrite_error(field, value);
2215 }
2216
2217 static __always_inline void vmcs_write16(unsigned long field, u16 value)
2218 {
2219         vmcs_check16(field);
2220         if (static_branch_unlikely(&enable_evmcs))
2221                 return evmcs_write16(field, value);
2222
2223         __vmcs_writel(field, value);
2224 }
2225
2226 static __always_inline void vmcs_write32(unsigned long field, u32 value)
2227 {
2228         vmcs_check32(field);
2229         if (static_branch_unlikely(&enable_evmcs))
2230                 return evmcs_write32(field, value);
2231
2232         __vmcs_writel(field, value);
2233 }
2234
2235 static __always_inline void vmcs_write64(unsigned long field, u64 value)
2236 {
2237         vmcs_check64(field);
2238         if (static_branch_unlikely(&enable_evmcs))
2239                 return evmcs_write64(field, value);
2240
2241         __vmcs_writel(field, value);
2242 #ifndef CONFIG_X86_64
2243         asm volatile ("");
2244         __vmcs_writel(field+1, value >> 32);
2245 #endif
2246 }
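
/*
 * On 32-bit kernels a 64-bit field cannot be read or written with a
 * single VMREAD/VMWRITE, so vmcs_read64()/vmcs_write64() above access the
 * field twice: the base encoding yields the low 32 bits and the encoding
 * with bit 0 set (field + 1, the "high" access type) yields the upper
 * 32 bits.
 */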
2247
2248 static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
2249 {
2250         vmcs_checkl(field);
2251         if (static_branch_unlikely(&enable_evmcs))
2252                 return evmcs_write64(field, value);
2253
2254         __vmcs_writel(field, value);
2255 }
2256
2257 static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
2258 {
2259         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2260                          "vmcs_clear_bits does not support 64-bit fields");
2261         if (static_branch_unlikely(&enable_evmcs))
2262                 return evmcs_write32(field, evmcs_read32(field) & ~mask);
2263
2264         __vmcs_writel(field, __vmcs_readl(field) & ~mask);
2265 }
2266
2267 static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
2268 {
2269         BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2270                          "vmcs_set_bits does not support 64-bit fields");
2271         if (static_branch_unlikely(&enable_evmcs))
2272                 return evmcs_write32(field, evmcs_read32(field) | mask);
2273
2274         __vmcs_writel(field, __vmcs_readl(field) | mask);
2275 }
2276
2277 static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
2278 {
2279         vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
2280 }
2281
2282 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
2283 {
2284         vmcs_write32(VM_ENTRY_CONTROLS, val);
2285         vmx->vm_entry_controls_shadow = val;
2286 }
2287
2288 static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
2289 {
2290         if (vmx->vm_entry_controls_shadow != val)
2291                 vm_entry_controls_init(vmx, val);
2292 }
2293
2294 static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
2295 {
2296         return vmx->vm_entry_controls_shadow;
2297 }
2298
2299
2300 static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2301 {
2302         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
2303 }
2304
2305 static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2306 {
2307         vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
2308 }
2309
2310 static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
2311 {
2312         vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
2313 }
2314
2315 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
2316 {
2317         vmcs_write32(VM_EXIT_CONTROLS, val);
2318         vmx->vm_exit_controls_shadow = val;
2319 }
2320
2321 static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
2322 {
2323         if (vmx->vm_exit_controls_shadow != val)
2324                 vm_exit_controls_init(vmx, val);
2325 }
2326
2327 static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
2328 {
2329         return vmx->vm_exit_controls_shadow;
2330 }
2331
2332
2333 static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2334 {
2335         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
2336 }
2337
2338 static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2339 {
2340         vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
2341 }
2342
2343 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
2344 {
2345         vmx->segment_cache.bitmask = 0;
2346 }
2347
2348 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
2349                                        unsigned field)
2350 {
2351         bool ret;
2352         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
2353
2354         if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
2355                 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
2356                 vmx->segment_cache.bitmask = 0;
2357         }
2358         ret = vmx->segment_cache.bitmask & mask;
2359         vmx->segment_cache.bitmask |= mask;
2360         return ret;
2361 }
2362
2363 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
2364 {
2365         u16 *p = &vmx->segment_cache.seg[seg].selector;
2366
2367         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
2368                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
2369         return *p;
2370 }
2371
2372 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
2373 {
2374         ulong *p = &vmx->segment_cache.seg[seg].base;
2375
2376         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
2377                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
2378         return *p;
2379 }
2380
2381 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
2382 {
2383         u32 *p = &vmx->segment_cache.seg[seg].limit;
2384
2385         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
2386                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
2387         return *p;
2388 }
2389
2390 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
2391 {
2392         u32 *p = &vmx->segment_cache.seg[seg].ar;
2393
2394         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
2395                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
2396         return *p;
2397 }
2398
2399 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2400 {
2401         u32 eb;
2402
2403         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
2404              (1u << DB_VECTOR) | (1u << AC_VECTOR);
2405         /*
2406          * Guest accesses to VMware backdoor ports can legitimately
2407          * trigger #GP because of the TSS I/O permission bitmap.
2408          * We intercept those #GPs and allow the accesses anyway,
2409          * as VMware does.
2410          */
2411         if (enable_vmware_backdoor)
2412                 eb |= (1u << GP_VECTOR);
2413         if ((vcpu->guest_debug &
2414              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
2415             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
2416                 eb |= 1u << BP_VECTOR;
2417         if (to_vmx(vcpu)->rmode.vm86_active)
2418                 eb = ~0;
2419         if (enable_ept)
2420                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
2421
2422         /* When we are running a nested L2 guest and L1 specified an exception
2423          * bitmap for it, we must trap the same exceptions and pass them to L1.
2424          * When running L2, we only handle the exceptions specified above if L1
2425          * did not want them.
2426          */
2427         if (is_guest_mode(vcpu))
2428                 eb |= get_vmcs12(vcpu)->exception_bitmap;
2429
2430         vmcs_write32(EXCEPTION_BITMAP, eb);
2431 }
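
/*
 * As a concrete illustration of the above: with EPT enabled and no guest
 * debugging, no real-mode emulation, no VMware backdoor emulation and no
 * nested guest, eb ends up as (1 << UD_VECTOR) | (1 << MC_VECTOR) |
 * (1 << DB_VECTOR) | (1 << AC_VECTOR) = 0x60042; the #PF bit is cleared
 * again because with EPT the guest handles its own page faults and KVM
 * does not need to intercept them.
 */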
2432
2433 /*
2434  * Check if writes to the MSR are intercepted in the currently loaded MSR bitmap.
2435  */
2436 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2437 {
2438         unsigned long *msr_bitmap;
2439         int f = sizeof(unsigned long);
2440
2441         if (!cpu_has_vmx_msr_bitmap())
2442                 return true;
2443
2444         msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2445
2446         if (msr <= 0x1fff) {
2447                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2448         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2449                 msr &= 0x1fff;
2450                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2451         }
2452
2453         return true;
2454 }
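
/*
 * Layout of the 4-KiB MSR bitmap consulted above (and by
 * msr_write_intercepted_l01() below): the first two 1-KiB chunks hold the
 * read bitmaps for the "low" MSRs (0x00000000 - 0x00001fff) and the
 * "high" MSRs (0xc0000000 - 0xc0001fff); the chunks at offsets 0x800 and
 * 0xc00 hold the corresponding write bitmaps.  A set bit means the access
 * is intercepted.  For example, a write to MSR_STAR (0xc0000081) is
 * governed by bit 0x81 of the chunk at offset 0xc00.
 */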
2455
2456 /*
2457  * Check if writes to the MSR are intercepted in the L01 MSR bitmap.
2458  */
2459 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2460 {
2461         unsigned long *msr_bitmap;
2462         int f = sizeof(unsigned long);
2463
2464         if (!cpu_has_vmx_msr_bitmap())
2465                 return true;
2466
2467         msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2468
2469         if (msr <= 0x1fff) {
2470                 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2471         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2472                 msr &= 0x1fff;
2473                 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2474         }
2475
2476         return true;
2477 }
2478
2479 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2480                 unsigned long entry, unsigned long exit)
2481 {
2482         vm_entry_controls_clearbit(vmx, entry);
2483         vm_exit_controls_clearbit(vmx, exit);
2484 }
2485
2486 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
2487 {
2488         unsigned i;
2489         struct msr_autoload *m = &vmx->msr_autoload;
2490
2491         switch (msr) {
2492         case MSR_EFER:
2493                 if (cpu_has_load_ia32_efer) {
2494                         clear_atomic_switch_msr_special(vmx,
2495                                         VM_ENTRY_LOAD_IA32_EFER,
2496                                         VM_EXIT_LOAD_IA32_EFER);
2497                         return;
2498                 }
2499                 break;
2500         case MSR_CORE_PERF_GLOBAL_CTRL:
2501                 if (cpu_has_load_perf_global_ctrl) {
2502                         clear_atomic_switch_msr_special(vmx,
2503                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2504                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2505                         return;
2506                 }
2507                 break;
2508         }
2509
2510         for (i = 0; i < m->nr; ++i)
2511                 if (m->guest[i].index == msr)
2512                         break;
2513
2514         if (i == m->nr)
2515                 return;
2516         --m->nr;
2517         m->guest[i] = m->guest[m->nr];
2518         m->host[i] = m->host[m->nr];
2519         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
2520         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
2521 }
2522
2523 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2524                 unsigned long entry, unsigned long exit,
2525                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
2526                 u64 guest_val, u64 host_val)
2527 {
2528         vmcs_write64(guest_val_vmcs, guest_val);
2529         vmcs_write64(host_val_vmcs, host_val);
2530         vm_entry_controls_setbit(vmx, entry);
2531         vm_exit_controls_setbit(vmx, exit);
2532 }
2533
2534 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
2535                                   u64 guest_val, u64 host_val)
2536 {
2537         unsigned i;
2538         struct msr_autoload *m = &vmx->msr_autoload;
2539
2540         switch (msr) {
2541         case MSR_EFER:
2542                 if (cpu_has_load_ia32_efer) {
2543                         add_atomic_switch_msr_special(vmx,
2544                                         VM_ENTRY_LOAD_IA32_EFER,
2545                                         VM_EXIT_LOAD_IA32_EFER,
2546                                         GUEST_IA32_EFER,
2547                                         HOST_IA32_EFER,
2548                                         guest_val, host_val);
2549                         return;
2550                 }
2551                 break;
2552         case MSR_CORE_PERF_GLOBAL_CTRL:
2553                 if (cpu_has_load_perf_global_ctrl) {
2554                         add_atomic_switch_msr_special(vmx,
2555                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2556                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
2557                                         GUEST_IA32_PERF_GLOBAL_CTRL,
2558                                         HOST_IA32_PERF_GLOBAL_CTRL,
2559                                         guest_val, host_val);
2560                         return;
2561                 }
2562                 break;
2563         case MSR_IA32_PEBS_ENABLE:
2564                 /* PEBS needs a quiescent period after being disabled (to write
2565                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
2566                  * provide that period, so the CPU could write the host's
2567                  * record into the guest's memory.
2568                  */
2569                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
2570         }
2571
2572         for (i = 0; i < m->nr; ++i)
2573                 if (m->guest[i].index == msr)
2574                         break;
2575
2576         if (i == NR_AUTOLOAD_MSRS) {
2577                 printk_once(KERN_WARNING "Not enough msr switch entries. "
2578                                 "Can't add msr %x\n", msr);
2579                 return;
2580         } else if (i == m->nr) {
2581                 ++m->nr;
2582                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
2583                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
2584         }
2585
2586         m->guest[i].index = msr;
2587         m->guest[i].value = guest_val;
2588         m->host[i].index = msr;
2589         m->host[i].value = host_val;
2590 }
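
/*
 * The two helpers above maintain the VMCS MSR "autoload" lists: m->guest[]
 * backs the VM-entry MSR-load area (values the CPU loads when entering the
 * guest) and m->host[] backs the VM-exit MSR-load area (values restored on
 * exit), with VM_ENTRY_MSR_LOAD_COUNT/VM_EXIT_MSR_LOAD_COUNT tracking the
 * shared length.  MSRs that have dedicated VM-entry/VM-exit controls
 * (EFER, IA32_PERF_GLOBAL_CTRL) are special-cased via the
 * *_atomic_switch_msr_special() paths instead, avoiding a list entry.
 */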
2591
2592 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2593 {
2594         u64 guest_efer = vmx->vcpu.arch.efer;
2595         u64 ignore_bits = 0;
2596
2597         if (!enable_ept) {
2598                 /*
2599                  * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
2600                  * host CPUID is more efficient than testing guest CPUID
2601                  * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
2602                  */
2603                 if (boot_cpu_has(X86_FEATURE_SMEP))
2604                         guest_efer |= EFER_NX;
2605                 else if (!(guest_efer & EFER_NX))
2606                         ignore_bits |= EFER_NX;
2607         }
2608
2609         /*
2610          * LMA and LME handled by hardware; SCE meaningless outside long mode.
2611          */
2612         ignore_bits |= EFER_SCE;
2613 #ifdef CONFIG_X86_64
2614         ignore_bits |= EFER_LMA | EFER_LME;
2615         /* SCE is meaningful only in long mode on Intel */
2616         if (guest_efer & EFER_LMA)
2617                 ignore_bits &= ~(u64)EFER_SCE;
2618 #endif
2619
2620         clear_atomic_switch_msr(vmx, MSR_EFER);
2621
2622         /*
2623          * On EPT, we can't emulate NX, so we must switch EFER atomically.
2624          * On CPUs that support "load IA32_EFER", always switch EFER
2625          * atomically, since it's faster than switching it manually.
2626          */
2627         if (cpu_has_load_ia32_efer ||
2628             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2629                 if (!(guest_efer & EFER_LMA))
2630                         guest_efer &= ~EFER_LME;
2631                 if (guest_efer != host_efer)
2632                         add_atomic_switch_msr(vmx, MSR_EFER,
2633                                               guest_efer, host_efer);
2634                 return false;
2635         } else {
2636                 guest_efer &= ~ignore_bits;
2637                 guest_efer |= host_efer & ignore_bits;
2638
2639                 vmx->guest_msrs[efer_offset].data = guest_efer;
2640                 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2641
2642                 return true;
2643         }
2644 }
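
/*
 * A note on the non-atomic path above: the guest_msrs[] entry is consumed
 * by the shared-MSR machinery (kvm_set_shared_msr()) when switching to the
 * guest, and the mask is applied so that bits in ignore_bits keep their
 * host values; roughly, the EFER actually loaded is
 * (guest_efer & ~ignore_bits) | (host_efer & ignore_bits), which the code
 * above precomputes into .data before storing ~ignore_bits as the mask.
 */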
2645
2646 #ifdef CONFIG_X86_32
2647 /*
2648  * On 32-bit kernels, VM exits still load the FS and GS bases from the
2649  * VMCS rather than the segment table.  KVM uses this helper to figure
2650  * out the current bases to poke them into the VMCS before entry.
2651  */
2652 static unsigned long segment_base(u16 selector)
2653 {
2654         struct desc_struct *table;
2655         unsigned long v;
2656
2657         if (!(selector & ~SEGMENT_RPL_MASK))
2658                 return 0;
2659
2660         table = get_current_gdt_ro();
2661
2662         if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2663                 u16 ldt_selector = kvm_read_ldt();
2664
2665                 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2666                         return 0;
2667
2668                 table = (struct desc_struct *)segment_base(ldt_selector);
2669         }
2670         v = get_desc_base(&table[selector >> 3]);
2671         return v;
2672 }
2673 #endif
2674
2675 static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
2676 {
2677         struct vcpu_vmx *vmx = to_vmx(vcpu);
2678         struct vmcs_host_state *host_state;
2679 #ifdef CONFIG_X86_64
2680         int cpu = raw_smp_processor_id();
2681 #endif
2682         unsigned long fs_base, gs_base;
2683         u16 fs_sel, gs_sel;
2684         int i;
2685
2686         if (vmx->loaded_cpu_state)
2687                 return;
2688
2689         vmx->loaded_cpu_state = vmx->loaded_vmcs;
2690         host_state = &vmx->loaded_cpu_state->host_state;
2691
2692         /*
2693          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
2694          * allow segment selectors with cpl > 0 or ti == 1.
2695          */
2696         host_state->ldt_sel = kvm_read_ldt();
2697
2698 #ifdef CONFIG_X86_64
2699         savesegment(ds, host_state->ds_sel);
2700         savesegment(es, host_state->es_sel);
2701
2702         gs_base = cpu_kernelmode_gs_base(cpu);
2703         if (likely(is_64bit_mm(current->mm))) {
2704                 save_fsgs_for_kvm();
2705                 fs_sel = current->thread.fsindex;
2706                 gs_sel = current->thread.gsindex;
2707                 fs_base = current->thread.fsbase;
2708                 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
2709         } else {
2710                 savesegment(fs, fs_sel);
2711                 savesegment(gs, gs_sel);
2712                 fs_base = read_msr(MSR_FS_BASE);
2713                 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
2714         }
2715
2716         if (is_long_mode(&vmx->vcpu))
2717                 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2718 #else
2719         savesegment(fs, fs_sel);
2720         savesegment(gs, gs_sel);
2721         fs_base = segment_base(fs_sel);
2722         gs_base = segment_base(gs_sel);
2723 #endif
2724
2725         if (unlikely(fs_sel != host_state->fs_sel)) {
2726                 if (!(fs_sel & 7))
2727                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2728                 else
2729                         vmcs_write16(HOST_FS_SELECTOR, 0);
2730                 host_state->fs_sel = fs_sel;
2731         }
2732         if (unlikely(gs_sel != host_state->gs_sel)) {
2733                 if (!(gs_sel & 7))
2734                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2735                 else
2736                         vmcs_write16(HOST_GS_SELECTOR, 0);
2737                 host_state->gs_sel = gs_sel;
2738         }
2739         if (unlikely(fs_base != host_state->fs_base)) {
2740                 vmcs_writel(HOST_FS_BASE, fs_base);
2741                 host_state->fs_base = fs_base;
2742         }
2743         if (unlikely(gs_base != host_state->gs_base)) {
2744                 vmcs_writel(HOST_GS_BASE, gs_base);
2745                 host_state->gs_base = gs_base;
2746         }
2747
2748         for (i = 0; i < vmx->save_nmsrs; ++i)
2749                 kvm_set_shared_msr(vmx->guest_msrs[i].index,
2750                                    vmx->guest_msrs[i].data,
2751                                    vmx->guest_msrs[i].mask);
2752 }
2753
2754 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
2755 {
2756         struct vmcs_host_state *host_state;
2757
2758         if (!vmx->loaded_cpu_state)
2759                 return;
2760
2761         WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
2762         host_state = &vmx->loaded_cpu_state->host_state;
2763
2764         ++vmx->vcpu.stat.host_state_reload;
2765         vmx->loaded_cpu_state = NULL;
2766
2767 #ifdef CONFIG_X86_64
2768         if (is_long_mode(&vmx->vcpu))
2769                 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2770 #endif
2771         if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
2772                 kvm_load_ldt(host_state->ldt_sel);
2773 #ifdef CONFIG_X86_64
2774                 load_gs_index(host_state->gs_sel);
2775 #else
2776                 loadsegment(gs, host_state->gs_sel);
2777 #endif
2778         }
2779         if (host_state->fs_sel & 7)
2780                 loadsegment(fs, host_state->fs_sel);
2781 #ifdef CONFIG_X86_64
2782         if (unlikely(host_state->ds_sel | host_state->es_sel)) {
2783                 loadsegment(ds, host_state->ds_sel);
2784                 loadsegment(es, host_state->es_sel);
2785         }
2786 #endif
2787         invalidate_tss_limit();
2788 #ifdef CONFIG_X86_64
2789         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2790 #endif
2791         load_fixmap_gdt(raw_smp_processor_id());
2792 }
2793
2794 #ifdef CONFIG_X86_64
2795 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
2796 {
2797         if (is_long_mode(&vmx->vcpu)) {
2798                 preempt_disable();
2799                 if (vmx->loaded_cpu_state)
2800                         rdmsrl(MSR_KERNEL_GS_BASE,
2801                                vmx->msr_guest_kernel_gs_base);
2802                 preempt_enable();
2803         }
2804         return vmx->msr_guest_kernel_gs_base;
2805 }
2806
2807 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
2808 {
2809         if (is_long_mode(&vmx->vcpu)) {
2810                 preempt_disable();
2811                 if (vmx->loaded_cpu_state)
2812                         wrmsrl(MSR_KERNEL_GS_BASE, data);
2813                 preempt_enable();
2814         }
2815         vmx->msr_guest_kernel_gs_base = data;
2816 }
2817 #endif
2818
2819 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2820 {
2821         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2822         struct pi_desc old, new;
2823         unsigned int dest;
2824
2825         /*
2826          * In case of hot-plug or hot-unplug, we may have to undo
2827          * vmx_vcpu_pi_put even if there is no assigned device.  And we
2828          * always keep PI.NDST up to date for simplicity: it makes the
2829          * code easier, and CPU migration is not a fast path.
2830          */
2831         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
2832                 return;
2833
2834         /*
2835          * First handle the simple case where no cmpxchg is necessary; just
2836          * allow posting non-urgent interrupts.
2837          *
2838          * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
2839          * PI.NDST: pi_post_block will do it for us and the wakeup_handler
2840          * expects the VCPU to be on the blocked_vcpu_list that matches
2841          * PI.NDST.
2842          */
2843         if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
2844             vcpu->cpu == cpu) {
2845                 pi_clear_sn(pi_desc);
2846                 return;
2847         }
2848
2849         /* The full case.  */
2850         do {
2851                 old.control = new.control = pi_desc->control;
2852
2853                 dest = cpu_physical_id(cpu);
2854
2855                 if (x2apic_enabled())
2856                         new.ndst = dest;
2857                 else
2858                         new.ndst = (dest << 8) & 0xFF00;
2859
2860                 new.sn = 0;
2861         } while (cmpxchg64(&pi_desc->control, old.control,
2862                            new.control) != old.control);
2863 }
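
/*
 * The NDST update above follows the posted-interrupt descriptor format:
 * in xAPIC mode the destination APIC ID lives in bits 15:8 of NDST (hence
 * the (dest << 8) & 0xFF00), while in x2APIC mode NDST holds the full
 * 32-bit APIC ID.  SN ("suppress notification") is cleared so that
 * interrupts posted while the vCPU runs on the new CPU actually raise the
 * notification vector there.
 */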
2864
2865 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
2866 {
2867         vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
2868         vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
2869 }
2870
2871 /*
2872  * Switches to the specified vcpu until a matching vcpu_put(); assumes
2873  * the vcpu mutex is already taken.
2874  */
2875 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2876 {
2877         struct vcpu_vmx *vmx = to_vmx(vcpu);
2878         bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2879
2880         if (!already_loaded) {
2881                 loaded_vmcs_clear(vmx->loaded_vmcs);
2882                 local_irq_disable();
2883                 crash_disable_local_vmclear(cpu);
2884
2885                 /*
2886                  * The read of loaded_vmcs->cpu must complete before
2887                  * fetching loaded_vmcs->loaded_vmcss_on_cpu_link.
2888                  * See the comments in __loaded_vmcs_clear().
2889                  */
2890                 smp_rmb();
2891
2892                 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2893                          &per_cpu(loaded_vmcss_on_cpu, cpu));
2894                 crash_enable_local_vmclear(cpu);
2895                 local_irq_enable();
2896         }
2897
2898         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2899                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2900                 vmcs_load(vmx->loaded_vmcs->vmcs);
2901                 indirect_branch_prediction_barrier();
2902         }
2903
2904         if (!already_loaded) {
2905                 void *gdt = get_current_gdt_ro();
2906                 unsigned long sysenter_esp;
2907
2908                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2909
2910                 /*
2911                  * Linux uses per-cpu TSS and GDT, so set these when switching
2912                  * processors.  See 22.2.4.
2913                  */
2914                 vmcs_writel(HOST_TR_BASE,
2915                             (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2916                 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
2917
2918                 /*
2919                  * VM exits change the host TR limit to 0x67 after a VM
2920                  * exit.  This is okay, since 0x67 covers everything except
2921                  * the IO bitmap and we have code to handle the IO bitmap
2922                  * being lost after a VM exit.
2923                  */
2924                 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
2925
2926                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2927                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2928
2929                 vmx->loaded_vmcs->cpu = cpu;
2930         }
2931
2932         /* Setup TSC multiplier */
2933         if (kvm_has_tsc_control &&
2934             vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
2935                 decache_tsc_multiplier(vmx);
2936
2937         vmx_vcpu_pi_load(vcpu, cpu);
2938         vmx->host_pkru = read_pkru();
2939         vmx->host_debugctlmsr = get_debugctlmsr();
2940 }
2941
2942 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2943 {
2944         struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2945
2946         if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2947                 !irq_remapping_cap(IRQ_POSTING_CAP)  ||
2948                 !kvm_vcpu_apicv_active(vcpu))
2949                 return;
2950
2951         /* Set SN when the vCPU is preempted */
2952         if (vcpu->preempted)
2953                 pi_set_sn(pi_desc);
2954 }
2955
2956 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2957 {
2958         vmx_vcpu_pi_put(vcpu);
2959
2960         vmx_prepare_switch_to_host(to_vmx(vcpu));
2961 }
2962
2963 static bool emulation_required(struct kvm_vcpu *vcpu)
2964 {
2965         return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2966 }
2967
2968 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2969
2970 /*
2971  * Return the cr0 value that a nested guest would read. This is a combination
2972  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
2973  * its hypervisor (cr0_read_shadow).
2974  */
2975 static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
2976 {
2977         return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
2978                 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
2979 }
2980 static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2981 {
2982         return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
2983                 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2984 }
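
/*
 * Worked example of the masking above (values purely illustrative):
 * suppose L1 owns only CR0.PE, i.e. cr0_guest_host_mask == 0x1, with
 * guest_cr0 == 0x80000031 and cr0_read_shadow == 0x0.  Bits not owned
 * by L1 come from guest_cr0, owned bits come from the read shadow:
 *
 *	(0x80000031 & ~0x1) | (0x0 & 0x1) == 0x80000030
 *
 * so L2 reads CR0 with PE clear even though the CR0 actually in use has
 * it set.  The CR4 helper works identically.
 */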
2985
2986 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2987 {
2988         unsigned long rflags, save_rflags;
2989
2990         if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
2991                 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2992                 rflags = vmcs_readl(GUEST_RFLAGS);
2993                 if (to_vmx(vcpu)->rmode.vm86_active) {
2994                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2995                         save_rflags = to_vmx(vcpu)->rmode.save_rflags;
2996                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2997                 }
2998                 to_vmx(vcpu)->rflags = rflags;
2999         }
3000         return to_vmx(vcpu)->rflags;
3001 }
3002
3003 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3004 {
3005         unsigned long old_rflags = vmx_get_rflags(vcpu);
3006
3007         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3008         to_vmx(vcpu)->rflags = rflags;
3009         if (to_vmx(vcpu)->rmode.vm86_active) {
3010                 to_vmx(vcpu)->rmode.save_rflags = rflags;
3011                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3012         }
3013         vmcs_writel(GUEST_RFLAGS, rflags);
3014
3015         if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
3016                 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
3017 }
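
/*
 * Illustrative vm86 case for vmx_set_rflags(): if the guest value is
 * RFLAGS == 0x202 (IF set) while rmode.vm86_active, the value written to
 * GUEST_RFLAGS is
 *
 *	0x202 | X86_EFLAGS_IOPL | X86_EFLAGS_VM == 0x23202
 *
 * i.e. IOPL==3 and VM==1, so IOPL-sensitive instructions (CLI, STI,
 * PUSHF, POPF, INT n, IRET) do not fault inside virtual-8086 mode, while
 * the unmodified 0x202 is stashed in rmode.save_rflags and is what
 * vmx_get_rflags() hands back.
 */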
3018
3019 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
3020 {
3021         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3022         int ret = 0;
3023
3024         if (interruptibility & GUEST_INTR_STATE_STI)
3025                 ret |= KVM_X86_SHADOW_INT_STI;
3026         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
3027                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
3028
3029         return ret;
3030 }
3031
3032 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
3033 {
3034         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3035         u32 interruptibility = interruptibility_old;
3036
3037         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
3038
3039         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
3040                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
3041         else if (mask & KVM_X86_SHADOW_INT_STI)
3042                 interruptibility |= GUEST_INTR_STATE_STI;
3043
3044         if (interruptibility != interruptibility_old)
3045                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
3046 }
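
/*
 * How the two helpers above translate between the KVM interrupt-shadow
 * API and GUEST_INTERRUPTIBILITY_INFO (note that MOV SS wins when both
 * mask bits are passed to vmx_set_interrupt_shadow()):
 *
 *	mask                            interruptibility bits set
 *	0                               none
 *	KVM_X86_SHADOW_INT_STI          GUEST_INTR_STATE_STI
 *	KVM_X86_SHADOW_INT_MOV_SS       GUEST_INTR_STATE_MOV_SS
 *	STI | MOV_SS                    GUEST_INTR_STATE_MOV_SS only
 */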
3047
3048 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
3049 {
3050         unsigned long rip;
3051
3052         rip = kvm_rip_read(vcpu);
3053         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3054         kvm_rip_write(vcpu, rip);
3055
3056         /* skipping an emulated instruction also counts */
3057         vmx_set_interrupt_shadow(vcpu, 0);
3058 }
3059
3060 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3061                                                unsigned long exit_qual)
3062 {
3063         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3064         unsigned int nr = vcpu->arch.exception.nr;
3065         u32 intr_info = nr | INTR_INFO_VALID_MASK;
3066
3067         if (vcpu->arch.exception.has_error_code) {
3068                 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3069                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3070         }
3071
3072         if (kvm_exception_is_soft(nr))
3073                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3074         else
3075                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3076
3077         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3078             vmx_get_nmi_mask(vcpu))
3079                 intr_info |= INTR_INFO_UNBLOCK_NMI;
3080
3081         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3082 }
3083
3084 /*
3085  * KVM wants to inject into the guest the page faults it intercepted.  For a
3086  * nested guest, this function checks whether to deliver them to L1 or to L2.
3087  */
3088 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
3089 {
3090         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3091         unsigned int nr = vcpu->arch.exception.nr;
3092
3093         if (nr == PF_VECTOR) {
3094                 if (vcpu->arch.exception.nested_apf) {
3095                         *exit_qual = vcpu->arch.apf.nested_apf_token;
3096                         return 1;
3097                 }
3098                 /*
3099                  * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
3100                  * The fix is to add the ancillary datum (CR2 or DR6) to structs
3101                  * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
3102                  * can be written only when inject_pending_event runs.  This should be
3103                  * conditional on a new capability---if the capability is disabled,
3104                  * kvm_multiple_exception would write the ancillary information to
3105                  * CR2 or DR6, for backwards ABI-compatibility.
3106                  */
3107                 if (nested_vmx_is_page_fault_vmexit(vmcs12,
3108                                                     vcpu->arch.exception.error_code)) {
3109                         *exit_qual = vcpu->arch.cr2;
3110                         return 1;
3111                 }
3112         } else {
3113                 if (vmcs12->exception_bitmap & (1u << nr)) {
3114                         if (nr == DB_VECTOR)
3115                                 *exit_qual = vcpu->arch.dr6;
3116                         else
3117                                 *exit_qual = 0;
3118                         return 1;
3119                 }
3120         }
3121
3122         return 0;
3123 }
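
/*
 * Summary of the cases above; returning 1 means "this exception should
 * cause a VM exit to L1", with *exit_qual as shown:
 *
 *	#PF, nested async-PF token pending      -> 1, exit_qual = APF token
 *	#PF intercepted by L1 (per vmcs12)      -> 1, exit_qual = CR2
 *	other vector set in L1's exception bitmap:
 *		#DB                             -> 1, exit_qual = DR6
 *		anything else                   -> 1, exit_qual = 0
 *	no intercept                            -> 0, inject into L2
 */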
3124
3125 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
3126 {
3127         /*
3128          * Ensure that we clear the HLT state in the VMCS.  We don't need to
3129          * explicitly skip the instruction because if the HLT state is set,
3130          * then the instruction is already executing and RIP has already been
3131          * advanced.
3132          */
3133         if (kvm_hlt_in_guest(vcpu->kvm) &&
3134                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
3135                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3136 }
3137
3138 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
3139 {
3140         struct vcpu_vmx *vmx = to_vmx(vcpu);
3141         unsigned nr = vcpu->arch.exception.nr;
3142         bool has_error_code = vcpu->arch.exception.has_error_code;
3143         u32 error_code = vcpu->arch.exception.error_code;
3144         u32 intr_info = nr | INTR_INFO_VALID_MASK;
3145
3146         if (has_error_code) {
3147                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
3148                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3149         }
3150
3151         if (vmx->rmode.vm86_active) {
3152                 int inc_eip = 0;
3153                 if (kvm_exception_is_soft(nr))
3154                         inc_eip = vcpu->arch.event_exit_inst_len;
3155                 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
3156                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3157                 return;
3158         }
3159
3160         WARN_ON_ONCE(vmx->emulation_required);
3161
3162         if (kvm_exception_is_soft(nr)) {
3163                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
3164                              vmx->vcpu.arch.event_exit_inst_len);
3165                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3166         } else
3167                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3168
3169         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
3170
3171         vmx_clear_hlt(vcpu);
3172 }
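
/*
 * Worked example of the VM_ENTRY_INTR_INFO_FIELD value built above: a
 * hardware #GP (vector 13) carrying an error code is queued as
 *
 *	intr_info = 13                          vector
 *	          | INTR_TYPE_HARD_EXCEPTION    (3 << 8)
 *	          | INTR_INFO_DELIVER_CODE_MASK (1 << 11)
 *	          | INTR_INFO_VALID_MASK        (1 << 31)
 *	          = 0x80000b0d
 *
 * with the error code itself written separately to
 * VM_ENTRY_EXCEPTION_ERROR_CODE.
 */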
3173
3174 static bool vmx_rdtscp_supported(void)
3175 {
3176         return cpu_has_vmx_rdtscp();
3177 }
3178
3179 static bool vmx_invpcid_supported(void)
3180 {
3181         return cpu_has_vmx_invpcid();
3182 }
3183
3184 /*
3185  * Swap MSR entry in host/guest MSR entry array.
3186  */
3187 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
3188 {
3189         struct shared_msr_entry tmp;
3190
3191         tmp = vmx->guest_msrs[to];
3192         vmx->guest_msrs[to] = vmx->guest_msrs[from];
3193         vmx->guest_msrs[from] = tmp;
3194 }
3195
3196 /*
3197  * Set up the vmcs to automatically save and restore system
3198  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
3199  * mode, as fiddling with msrs is very expensive.
3200  */
3201 static void setup_msrs(struct vcpu_vmx *vmx)
3202 {
3203         int save_nmsrs, index;
3204
3205         save_nmsrs = 0;
3206 #ifdef CONFIG_X86_64
3207         if (is_long_mode(&vmx->vcpu)) {
3208                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
3209                 if (index >= 0)
3210                         move_msr_up(vmx, index, save_nmsrs++);
3211                 index = __find_msr_index(vmx, MSR_LSTAR);
3212                 if (index >= 0)
3213                         move_msr_up(vmx, index, save_nmsrs++);
3214                 index = __find_msr_index(vmx, MSR_CSTAR);
3215                 if (index >= 0)
3216                         move_msr_up(vmx, index, save_nmsrs++);
3217                 index = __find_msr_index(vmx, MSR_TSC_AUX);
3218                 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
3219                         move_msr_up(vmx, index, save_nmsrs++);
3220                 /*
3221                  * MSR_STAR is only needed on long mode guests, and only
3222                  * if efer.sce is enabled.
3223                  */
3224                 index = __find_msr_index(vmx, MSR_STAR);
3225                 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
3226                         move_msr_up(vmx, index, save_nmsrs++);
3227         }
3228 #endif
3229         index = __find_msr_index(vmx, MSR_EFER);
3230         if (index >= 0 && update_transition_efer(vmx, index))
3231                 move_msr_up(vmx, index, save_nmsrs++);
3232
3233         vmx->save_nmsrs = save_nmsrs;
3234
3235         if (cpu_has_vmx_msr_bitmap())
3236                 vmx_update_msr_bitmap(&vmx->vcpu);
3237 }
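
/*
 * Resulting layout, for a hypothetical 64-bit guest with RDTSCP enabled
 * and EFER.SCE set, and assuming update_transition_efer() asks for EFER
 * to be switched: after the swaps above,
 *
 *	guest_msrs[0..5] = SYSCALL_MASK, LSTAR, CSTAR, TSC_AUX, STAR, EFER
 *	save_nmsrs       = 6
 *
 * Entries beyond save_nmsrs stay in the array but are not switched for
 * this guest.
 */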
3238
3239 static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
3240 {
3241         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3242
3243         if (is_guest_mode(vcpu) &&
3244             (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
3245                 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
3246
3247         return vcpu->arch.tsc_offset;
3248 }
3249
3250 /*
3251  * writes 'offset' into guest's timestamp counter offset register
3252  */
3253 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
3254 {
3255         if (is_guest_mode(vcpu)) {
3256                 /*
3257                  * We're here if L1 chose not to trap WRMSR to TSC. According
3258                  * to the spec, this should set L1's TSC; the offset that L1
3259                  * set for L2 remains unchanged and still needs to be added
3260                  * to the newly set TSC to get L2's TSC.
3261                  */
3262                 struct vmcs12 *vmcs12;
3263                 /* recalculate vmcs02.TSC_OFFSET: */
3264                 vmcs12 = get_vmcs12(vcpu);
3265                 vmcs_write64(TSC_OFFSET, offset +
3266                         (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
3267                          vmcs12->tsc_offset : 0));
3268         } else {
3269                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
3270                                            vmcs_read64(TSC_OFFSET), offset);
3271                 vmcs_write64(TSC_OFFSET, offset);
3272         }
3273 }
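
/*
 * Offset arithmetic sketch (illustrative numbers): while L2 runs,
 * vcpu->arch.tsc_offset holds the offset active in vmcs02, i.e. L1's own
 * offset plus vmcs12->tsc_offset.  With an L1 offset of 1000 and a
 * further L1-programmed offset of 50 for L2:
 *
 *	vmcs02.TSC_OFFSET        = 1000 + 50 = 1050
 *	vmx_read_l1_tsc_offset() = 1050 - 50 = 1000
 *
 * and a WRMSR from L1 that reaches vmx_write_tsc_offset() with a new L1
 * offset of 2000 rewrites vmcs02.TSC_OFFSET as 2000 + 50 = 2050, leaving
 * the extra offset L1 chose for L2 intact.
 */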
3274
3275 /*
3276  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
3277  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
3278  * all guests if the "nested" module option is off, and can also be disabled
3279  * for a single guest by disabling its VMX cpuid bit.
3280  */
3281 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
3282 {
3283         return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
3284 }
3285
3286 /*
3287  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
3288  * returned for the various VMX controls MSRs when nested VMX is enabled.
3289  * The same values should also be used to verify that vmcs12 control fields are
3290  * valid during nested entry from L1 to L2.
3291  * Each of these control msrs has a low and high 32-bit half: A low bit is on
3292  * if the corresponding bit in the (32-bit) control field *must* be on, and a
3293  * bit in the high half is on if the corresponding bit in the control field
3294  * may be on. See also vmx_control_verify().
3295  */
3296 static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
3297 {
3298         if (!nested) {
3299                 memset(msrs, 0, sizeof(*msrs));
3300                 return;
3301         }
3302
3303         /*
3304          * Note that as a general rule, the high half of the MSRs (bits in
3305          * the control fields which may be 1) should be initialized by the
3306          * intersection of the underlying hardware's MSR (i.e., features which
3307          * can be supported) and the list of features we want to expose -
3308          * because they are known to be properly supported in our code.
3309          * Also, usually, the low half of the MSRs (bits which must be 1) can
3310          * be set to 0, meaning that L1 may turn off any of these bits. The
3311          * reason is that if one of these bits is necessary, it will appear in
3312          * vmcs01, and prepare_vmcs02, which bitwise-or's the control fields of
3313          * vmcs01 and vmcs12, will keep it set in vmcs02 even if L1 cleared it -
3314          * and nested_vmx_exit_reflected() will not pass related exits to L1.
3315          * These rules have exceptions below.
3316          */
3317
3318         /* pin-based controls */
3319         rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
3320                 msrs->pinbased_ctls_low,
3321                 msrs->pinbased_ctls_high);
3322         msrs->pinbased_ctls_low |=
3323                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3324         msrs->pinbased_ctls_high &=
3325                 PIN_BASED_EXT_INTR_MASK |
3326                 PIN_BASED_NMI_EXITING |
3327                 PIN_BASED_VIRTUAL_NMIS |
3328                 (apicv ? PIN_BASED_POSTED_INTR : 0);
3329         msrs->pinbased_ctls_high |=
3330                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
3331                 PIN_BASED_VMX_PREEMPTION_TIMER;
3332
3333         /* exit controls */
3334         rdmsr(MSR_IA32_VMX_EXIT_CTLS,
3335                 msrs->exit_ctls_low,
3336                 msrs->exit_ctls_high);
3337         msrs->exit_ctls_low =
3338                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3339
3340         msrs->exit_ctls_high &=
3341 #ifdef CONFIG_X86_64
3342                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
3343 #endif
3344                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
3345         msrs->exit_ctls_high |=
3346                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
3347                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
3348                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
3349
3350         if (kvm_mpx_supported())
3351                 msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
3352
3353         /* We support free control of debug control saving. */
3354         msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
3355
3356         /* entry controls */
3357         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
3358                 msrs->entry_ctls_low,
3359                 msrs->entry_ctls_high);
3360         msrs->entry_ctls_low =
3361                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3362         msrs->entry_ctls_high &=
3363 #ifdef CONFIG_X86_64
3364                 VM_ENTRY_IA32E_MODE |
3365 #endif
3366                 VM_ENTRY_LOAD_IA32_PAT;
3367         msrs->entry_ctls_high |=
3368                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
3369         if (kvm_mpx_supported())
3370                 msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
3371
3372         /* We support free control of debug control loading. */
3373         msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
3374
3375         /* cpu-based controls */
3376         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
3377                 msrs->procbased_ctls_low,
3378                 msrs->procbased_ctls_high);
3379         msrs->procbased_ctls_low =
3380                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3381         msrs->procbased_ctls_high &=
3382                 CPU_BASED_VIRTUAL_INTR_PENDING |
3383                 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
3384                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
3385                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
3386                 CPU_BASED_CR3_STORE_EXITING |
3387 #ifdef CONFIG_X86_64
3388                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
3389 #endif
3390                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
3391                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
3392                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
3393                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
3394                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3395         /*
3396          * We can allow some features even when not supported by the
3397          * hardware. For example, L1 can specify an MSR bitmap - and we
3398          * can use it to avoid exits to L1 - even when L0 runs L2
3399          * without MSR bitmaps.
3400          */
3401         msrs->procbased_ctls_high |=
3402                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
3403                 CPU_BASED_USE_MSR_BITMAPS;
3404
3405         /* We support free control of CR3 access interception. */
3406         msrs->procbased_ctls_low &=
3407                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
3408
3409         /*
3410          * secondary cpu-based controls.  Do not include those that
3411          * depend on CPUID bits; they are added later by vmx_cpuid_update.
3412          */
3413         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
3414                 msrs->secondary_ctls_low,
3415                 msrs->secondary_ctls_high);
3416         msrs->secondary_ctls_low = 0;
3417         msrs->secondary_ctls_high &=
3418                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3419                 SECONDARY_EXEC_DESC |
3420                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3421                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3422                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3423                 SECONDARY_EXEC_WBINVD_EXITING;
3424         /*
3425          * We can emulate "VMCS shadowing," even if the hardware
3426          * doesn't support it.
3427          */
3428         msrs->secondary_ctls_high |=
3429                 SECONDARY_EXEC_SHADOW_VMCS;
3430
3431         if (enable_ept) {
3432                 /* nested EPT: emulate EPT also to L1 */
3433                 msrs->secondary_ctls_high |=
3434                         SECONDARY_EXEC_ENABLE_EPT;
3435                 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
3436                          VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
3437                 if (cpu_has_vmx_ept_execute_only())
3438                         msrs->ept_caps |=
3439                                 VMX_EPT_EXECUTE_ONLY_BIT;
3440                 msrs->ept_caps &= vmx_capability.ept;
3441                 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
3442                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
3443                         VMX_EPT_1GB_PAGE_BIT;
3444                 if (enable_ept_ad_bits) {
3445                         msrs->secondary_ctls_high |=
3446                                 SECONDARY_EXEC_ENABLE_PML;
3447                         msrs->ept_caps |= VMX_EPT_AD_BIT;
3448                 }
3449         }
3450
3451         if (cpu_has_vmx_vmfunc()) {
3452                 msrs->secondary_ctls_high |=
3453                         SECONDARY_EXEC_ENABLE_VMFUNC;
3454                 /*
3455                  * Advertise EPTP switching unconditionally
3456                  * since we emulate it
3457                  */
3458                 if (enable_ept)
3459                         msrs->vmfunc_controls =
3460                                 VMX_VMFUNC_EPTP_SWITCHING;
3461         }
3462
3463         /*
3464          * Old versions of KVM use the single-context version without
3465          * checking for support, so declare that it is supported even
3466          * though it is treated as global context.  The alternative is
3467          * not failing the single-context invvpid, and it is worse.
3468          */
3469         if (enable_vpid) {
3470                 msrs->secondary_ctls_high |=
3471                         SECONDARY_EXEC_ENABLE_VPID;
3472                 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
3473                         VMX_VPID_EXTENT_SUPPORTED_MASK;
3474         }
3475
3476         if (enable_unrestricted_guest)
3477                 msrs->secondary_ctls_high |=
3478                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
3479
3480         /* miscellaneous data */
3481         rdmsr(MSR_IA32_VMX_MISC,
3482                 msrs->misc_low,
3483                 msrs->misc_high);
3484         msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
3485         msrs->misc_low |=
3486                 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
3487                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
3488                 VMX_MISC_ACTIVITY_HLT;
3489         msrs->misc_high = 0;
3490
3491         /*
3492          * This MSR reports some information about VMX support. We
3493          * should return information about the VMX we emulate for the
3494          * guest, and the VMCS structure we give it - not about the
3495          * VMX support of the underlying hardware.
3496          */
3497         msrs->basic =
3498                 VMCS12_REVISION |
3499                 VMX_BASIC_TRUE_CTLS |
3500                 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
3501                 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
3502
3503         if (cpu_has_vmx_basic_inout())
3504                 msrs->basic |= VMX_BASIC_INOUT;
3505
3506         /*
3507          * These MSRs specify bits which the guest must keep fixed on
3508          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
3509          * We picked the standard core2 setting.
3510          */
3511 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
3512 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
3513         msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
3514         msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
3515
3516         /* Bits clear in these (FIXED1) MSRs must be kept off by the guest. */
3517         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
3518         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
3519
3520         /* highest index: VMX_PREEMPTION_TIMER_VALUE */
3521         msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
3522 }
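
/*
 * What L1 sees: reading one of the VMX control capability MSRs returns
 * vmx_control_msr(low, high), with the must-be-1 half in bits 31:0 and
 * the may-be-1 half in bits 63:32.  A hypothetical pin-based example,
 * assuming apicv is off and the hardware reports no extra must-be-1 bits:
 *
 *	low  = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR
 *	high = low | PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
 *	       PIN_BASED_VIRTUAL_NMIS | PIN_BASED_VMX_PREEMPTION_TIMER
 *
 * so L1 must keep the always-on bits set, may set the other listed bits,
 * and must leave everything else clear; vmcs12 control fields are checked
 * against exactly this pair by vmx_control_verify().
 */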
3523
3524 /*
3525  * if fixed0[i] == 1: val[i] must be 1
3526  * if fixed1[i] == 0: val[i] must be 0
3527  */
3528 static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
3529 {
3530         return ((val & fixed1) | fixed0) == val;
3531 }
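
/*
 * Quick numerical check of the rule above, with fixed0 == 0b0001 (bit 0
 * must be 1) and fixed1 == 0b0111 (bit 3 must be 0):
 *
 *	val == 0b0101: ((0b0101 & 0b0111) | 0b0001) == 0b0101 == val -> ok
 *	val == 0b1101: result is 0b0101 != val -> invalid (bit 3 set)
 *	val == 0b0100: result is 0b0101 != val -> invalid (bit 0 clear)
 */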
3532
3533 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
3534 {
3535         return fixed_bits_valid(control, low, high);
3536 }
3537
3538 static inline u64 vmx_control_msr(u32 low, u32 high)
3539 {
3540         return low | ((u64)high << 32);
3541 }
3542
3543 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
3544 {
3545         superset &= mask;
3546         subset &= mask;
3547
3548         return (superset | subset) == superset;
3549 }
3550
3551 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
3552 {
3553         const u64 feature_and_reserved =
3554                 /* feature (except bit 48; see below) */
3555                 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
3556                 /* reserved */
3557                 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
3558         u64 vmx_basic = vmx->nested.msrs.basic;
3559
3560         if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
3561                 return -EINVAL;
3562
3563         /*
3564          * KVM does not emulate a version of VMX that constrains physical
3565          * addresses of VMX structures (e.g. VMCS) to 32-bits.
3566          */
3567         if (data & BIT_ULL(48))
3568                 return -EINVAL;
3569
3570         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
3571             vmx_basic_vmcs_revision_id(data))
3572                 return -EINVAL;
3573
3574         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
3575                 return -EINVAL;
3576
3577         vmx->nested.msrs.basic = data;
3578         return 0;
3579 }
3580
3581 static int
3582 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3583 {
3584         u64 supported;
3585         u32 *lowp, *highp;
3586
3587         switch (msr_index) {
3588         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3589                 lowp = &vmx->nested.msrs.pinbased_ctls_low;
3590                 highp = &vmx->nested.msrs.pinbased_ctls_high;
3591                 break;
3592         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3593                 lowp = &vmx->nested.msrs.procbased_ctls_low;
3594                 highp = &vmx->nested.msrs.procbased_ctls_high;
3595                 break;
3596         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3597                 lowp = &vmx->nested.msrs.exit_ctls_low;
3598                 highp = &vmx->nested.msrs.exit_ctls_high;
3599                 break;
3600         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3601                 lowp = &vmx->nested.msrs.entry_ctls_low;
3602                 highp = &vmx->nested.msrs.entry_ctls_high;
3603                 break;
3604         case MSR_IA32_VMX_PROCBASED_CTLS2:
3605                 lowp = &vmx->nested.msrs.secondary_ctls_low;
3606                 highp = &vmx->nested.msrs.secondary_ctls_high;
3607                 break;
3608         default:
3609                 BUG();
3610         }
3611
3612         supported = vmx_control_msr(*lowp, *highp);
3613
3614         /* Check must-be-1 bits are still 1. */
3615         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
3616                 return -EINVAL;
3617
3618         /* Check must-be-0 bits are still 0. */
3619         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
3620                 return -EINVAL;
3621
3622         *lowp = data;
3623         *highp = data >> 32;
3624         return 0;
3625 }
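
/*
 * Net effect of the two subset checks above: userspace (e.g. during
 * migration) may only restrict the advertised nested-VMX controls
 * relative to what KVM reports, never extend them.  Concretely, the new
 * value may clear an allowed-1 bit in 63:32 or add a must-be-1 bit in
 * 31:0, but it is rejected if it advertises an allowed-1 bit KVM does
 * not support or drops an existing must-be-1 bit.
 */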
3626
3627 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3628 {
3629         const u64 feature_and_reserved_bits =
3630                 /* feature */
3631                 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
3632                 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
3633                 /* reserved */
3634                 GENMASK_ULL(13, 9) | BIT_ULL(31);
3635         u64 vmx_misc;
3636
3637         vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,