2 * Kernel-based Virtual Machine driver for Linux
4 * derived from drivers/kvm/kvm_main.c
6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
14 * Amit Shah <amit.shah@qumranet.com>
15 * Ben-Ami Yassour <benami@il.ibm.com>
17 * This work is licensed under the terms of the GNU GPL, version 2. See
18 * the COPYING file in the top-level directory.
22 #include <linux/kvm_host.h>
27 #include "kvm_cache_regs.h"
33 #include <linux/clocksource.h>
34 #include <linux/interrupt.h>
35 #include <linux/kvm.h>
37 #include <linux/vmalloc.h>
38 #include <linux/export.h>
39 #include <linux/moduleparam.h>
40 #include <linux/mman.h>
41 #include <linux/highmem.h>
42 #include <linux/iommu.h>
43 #include <linux/intel-iommu.h>
44 #include <linux/cpufreq.h>
45 #include <linux/user-return-notifier.h>
46 #include <linux/srcu.h>
47 #include <linux/slab.h>
48 #include <linux/perf_event.h>
49 #include <linux/uaccess.h>
50 #include <linux/hash.h>
51 #include <linux/pci.h>
52 #include <linux/timekeeper_internal.h>
53 #include <linux/pvclock_gtod.h>
54 #include <linux/kvm_irqfd.h>
55 #include <linux/irqbypass.h>
56 #include <linux/sched/stat.h>
57 #include <linux/mem_encrypt.h>
59 #include <trace/events/kvm.h>
61 #include <asm/debugreg.h>
65 #include <linux/kernel_stat.h>
66 #include <asm/fpu/internal.h> /* Ugh! */
67 #include <asm/pvclock.h>
68 #include <asm/div64.h>
69 #include <asm/irq_remapping.h>
70 #include <asm/mshyperv.h>
71 #include <asm/hypervisor.h>
73 #define CREATE_TRACE_POINTS
76 #define MAX_IO_MSRS 256
77 #define KVM_MAX_MCE_BANKS 32
78 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
79 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
81 #define emul_to_vcpu(ctxt) \
82 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
85 * - enable syscall by default because it's emulated by KVM
86 * - enable LME and LMA by default on 64-bit KVM
90 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
92 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
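/*
 * Illustrative note (not in the original source): the masks above only
 * allow SCE/LME/LMA by default; vendor modules are expected to widen the
 * set at init time via kvm_enable_efer_bits() further down in this file,
 * roughly along the lines of
 *
 *	if (boot_cpu_has(X86_FEATURE_NX))
 *		kvm_enable_efer_bits(EFER_NX);
 *
 * after which set_efer() no longer treats EFER.NX as a reserved bit.
 */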
95 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
96 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
98 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
99 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
101 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
102 static void process_nmi(struct kvm_vcpu *vcpu);
103 static void enter_smm(struct kvm_vcpu *vcpu);
104 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
105 static void store_regs(struct kvm_vcpu *vcpu);
106 static int sync_regs(struct kvm_vcpu *vcpu);
108 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
109 EXPORT_SYMBOL_GPL(kvm_x86_ops);
111 static bool __read_mostly ignore_msrs = 0;
112 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
114 static bool __read_mostly report_ignored_msrs = true;
115 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
117 unsigned int min_timer_period_us = 200;
118 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
120 static bool __read_mostly kvmclock_periodic_sync = true;
121 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
123 bool __read_mostly kvm_has_tsc_control;
124 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
125 u32 __read_mostly kvm_max_guest_tsc_khz;
126 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
127 u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
128 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
129 u64 __read_mostly kvm_max_tsc_scaling_ratio;
130 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
131 u64 __read_mostly kvm_default_tsc_scaling_ratio;
132 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
134 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
135 static u32 __read_mostly tsc_tolerance_ppm = 250;
136 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
138 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
139 unsigned int __read_mostly lapic_timer_advance_ns = 0;
140 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
141 EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
143 static bool __read_mostly vector_hashing = true;
144 module_param(vector_hashing, bool, S_IRUGO);
146 bool __read_mostly enable_vmware_backdoor = false;
147 module_param(enable_vmware_backdoor, bool, S_IRUGO);
148 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
150 static bool __read_mostly force_emulation_prefix = false;
151 module_param(force_emulation_prefix, bool, S_IRUGO);
153 #define KVM_NR_SHARED_MSRS 16
155 struct kvm_shared_msrs_global {
157 u32 msrs[KVM_NR_SHARED_MSRS];
160 struct kvm_shared_msrs {
161 struct user_return_notifier urn;
163 struct kvm_shared_msr_values {
166 } values[KVM_NR_SHARED_MSRS];
169 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
170 static struct kvm_shared_msrs __percpu *shared_msrs;
172 struct kvm_stats_debugfs_item debugfs_entries[] = {
173 { "pf_fixed", VCPU_STAT(pf_fixed) },
174 { "pf_guest", VCPU_STAT(pf_guest) },
175 { "tlb_flush", VCPU_STAT(tlb_flush) },
176 { "invlpg", VCPU_STAT(invlpg) },
177 { "exits", VCPU_STAT(exits) },
178 { "io_exits", VCPU_STAT(io_exits) },
179 { "mmio_exits", VCPU_STAT(mmio_exits) },
180 { "signal_exits", VCPU_STAT(signal_exits) },
181 { "irq_window", VCPU_STAT(irq_window_exits) },
182 { "nmi_window", VCPU_STAT(nmi_window_exits) },
183 { "halt_exits", VCPU_STAT(halt_exits) },
184 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
185 { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
186 { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
187 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
188 { "hypercalls", VCPU_STAT(hypercalls) },
189 { "request_irq", VCPU_STAT(request_irq_exits) },
190 { "irq_exits", VCPU_STAT(irq_exits) },
191 { "host_state_reload", VCPU_STAT(host_state_reload) },
192 { "fpu_reload", VCPU_STAT(fpu_reload) },
193 { "insn_emulation", VCPU_STAT(insn_emulation) },
194 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
195 { "irq_injections", VCPU_STAT(irq_injections) },
196 { "nmi_injections", VCPU_STAT(nmi_injections) },
197 { "req_event", VCPU_STAT(req_event) },
198 { "l1d_flush", VCPU_STAT(l1d_flush) },
199 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
200 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
201 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
202 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
203 { "mmu_flooded", VM_STAT(mmu_flooded) },
204 { "mmu_recycled", VM_STAT(mmu_recycled) },
205 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
206 { "mmu_unsync", VM_STAT(mmu_unsync) },
207 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
208 { "largepages", VM_STAT(lpages) },
209 { "max_mmu_page_hash_collisions",
210 VM_STAT(max_mmu_page_hash_collisions) },
214 u64 __read_mostly host_xcr0;
216 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
218 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
221 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
222 vcpu->arch.apf.gfns[i] = ~0;
225 static void kvm_on_user_return(struct user_return_notifier *urn)
228 struct kvm_shared_msrs *locals
229 = container_of(urn, struct kvm_shared_msrs, urn);
230 struct kvm_shared_msr_values *values;
234 * Disabling irqs at this point since the following code could be
235 * interrupted and executed through kvm_arch_hardware_disable()
237 local_irq_save(flags);
238 if (locals->registered) {
239 locals->registered = false;
240 user_return_notifier_unregister(urn);
242 local_irq_restore(flags);
243 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
244 values = &locals->values[slot];
245 if (values->host != values->curr) {
246 wrmsrl(shared_msrs_global.msrs[slot], values->host);
247 values->curr = values->host;
252 static void shared_msr_update(unsigned slot, u32 msr)
255 unsigned int cpu = smp_processor_id();
256 struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
258 /* only read, and nobody should modify it at this time,
259 * so no lock is needed */
260 if (slot >= shared_msrs_global.nr) {
261 printk(KERN_ERR "kvm: invalid MSR slot!");
264 rdmsrl_safe(msr, &value);
265 smsr->values[slot].host = value;
266 smsr->values[slot].curr = value;
269 void kvm_define_shared_msr(unsigned slot, u32 msr)
271 BUG_ON(slot >= KVM_NR_SHARED_MSRS);
272 shared_msrs_global.msrs[slot] = msr;
273 if (slot >= shared_msrs_global.nr)
274 shared_msrs_global.nr = slot + 1;
276 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
278 static void kvm_shared_msr_cpu_online(void)
282 for (i = 0; i < shared_msrs_global.nr; ++i)
283 shared_msr_update(i, shared_msrs_global.msrs[i]);
286 int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
288 unsigned int cpu = smp_processor_id();
289 struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
292 if (((value ^ smsr->values[slot].curr) & mask) == 0)
294 smsr->values[slot].curr = value;
295 err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
299 if (!smsr->registered) {
300 smsr->urn.on_user_return = kvm_on_user_return;
301 user_return_notifier_register(&smsr->urn);
302 smsr->registered = true;
306 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
308 static void drop_user_return_notifiers(void)
310 unsigned int cpu = smp_processor_id();
311 struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
313 if (smsr->registered)
314 kvm_on_user_return(&smsr->urn);
317 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
319 return vcpu->arch.apic_base;
321 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
323 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
325 return kvm_apic_mode(kvm_get_apic_base(vcpu));
327 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
329 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
331 enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
332 enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
333 u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
334 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
336 if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
338 if (!msr_info->host_initiated) {
339 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
341 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
345 kvm_lapic_set_base(vcpu, msr_info->data);
348 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
350 asmlinkage __visible void kvm_spurious_fault(void)
352 /* Fault while not rebooting. We want the trace. */
355 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
357 #define EXCPT_BENIGN 0
358 #define EXCPT_CONTRIBUTORY 1
361 static int exception_class(int vector)
371 return EXCPT_CONTRIBUTORY;
378 #define EXCPT_FAULT 0
380 #define EXCPT_ABORT 2
381 #define EXCPT_INTERRUPT 3
383 static int exception_type(int vector)
387 if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
388 return EXCPT_INTERRUPT;
392 /* #DB is a trap, as instruction watchpoints are handled elsewhere */
393 if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
396 if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
399 /* Reserved exceptions will result in fault */
403 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
404 unsigned nr, bool has_error, u32 error_code,
410 kvm_make_request(KVM_REQ_EVENT, vcpu);
412 if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
414 if (has_error && !is_protmode(vcpu))
418 * On vmentry, vcpu->arch.exception.pending is only
419 * true if an event injection was blocked by
420 * nested_run_pending. In that case, however,
421 * vcpu_enter_guest requests an immediate exit,
422 * and the guest shouldn't proceed far enough to need reinjection.
425 WARN_ON_ONCE(vcpu->arch.exception.pending);
426 vcpu->arch.exception.injected = true;
428 vcpu->arch.exception.pending = true;
429 vcpu->arch.exception.injected = false;
431 vcpu->arch.exception.has_error_code = has_error;
432 vcpu->arch.exception.nr = nr;
433 vcpu->arch.exception.error_code = error_code;
437 /* an exception is already queued: check whether the new one turns it into #DF */
438 prev_nr = vcpu->arch.exception.nr;
439 if (prev_nr == DF_VECTOR) {
440 /* triple fault -> shutdown */
441 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
444 class1 = exception_class(prev_nr);
445 class2 = exception_class(nr);
446 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
447 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
449 * Generate double fault per SDM Table 5-5. Set
450 * exception.pending = true so that the double fault
451 * can trigger a nested vmexit.
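/*
 * Worked example of the classification above (illustrative, per the SDM
 * table cited): a second #GP raised while an earlier #GP is still
 * pending, or a new #PF raised while an earlier #PF is pending, folds
 * into #DF here; a #PF raised while a #GP is pending does not, and
 * instead takes the replace-and-retry path below.
 */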
453 vcpu->arch.exception.pending = true;
454 vcpu->arch.exception.injected = false;
455 vcpu->arch.exception.has_error_code = true;
456 vcpu->arch.exception.nr = DF_VECTOR;
457 vcpu->arch.exception.error_code = 0;
459 /* replace previous exception with a new one in the hope
460 that instruction re-execution will regenerate the lost exception */
465 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
467 kvm_multiple_exception(vcpu, nr, false, 0, false);
469 EXPORT_SYMBOL_GPL(kvm_queue_exception);
471 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
473 kvm_multiple_exception(vcpu, nr, false, 0, true);
475 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
477 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
480 kvm_inject_gp(vcpu, 0);
482 return kvm_skip_emulated_instruction(vcpu);
486 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
488 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
490 ++vcpu->stat.pf_guest;
491 vcpu->arch.exception.nested_apf =
492 is_guest_mode(vcpu) && fault->async_page_fault;
493 if (vcpu->arch.exception.nested_apf)
494 vcpu->arch.apf.nested_apf_token = fault->address;
496 vcpu->arch.cr2 = fault->address;
497 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
499 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
501 static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
503 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
504 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
506 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
508 return fault->nested_page_fault;
511 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
513 atomic_inc(&vcpu->arch.nmi_queued);
514 kvm_make_request(KVM_REQ_NMI, vcpu);
516 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
518 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
520 kvm_multiple_exception(vcpu, nr, true, error_code, false);
522 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
524 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
526 kvm_multiple_exception(vcpu, nr, true, error_code, true);
528 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
531 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
532 * a #GP and return false.
534 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
536 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
538 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
541 EXPORT_SYMBOL_GPL(kvm_require_cpl);
543 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
545 if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
548 kvm_queue_exception(vcpu, UD_VECTOR);
551 EXPORT_SYMBOL_GPL(kvm_require_dr);
554 * This function is used to read from the physical memory of the currently
555 * running guest. The difference from kvm_vcpu_read_guest_page is that this function
556 * can read from guest physical memory or from the guest's guest physical memory.
558 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
559 gfn_t ngfn, void *data, int offset, int len,
562 struct x86_exception exception;
566 ngpa = gfn_to_gpa(ngfn);
567 real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
568 if (real_gfn == UNMAPPED_GVA)
571 real_gfn = gpa_to_gfn(real_gfn);
573 return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
575 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
577 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
578 void *data, int offset, int len, u32 access)
580 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
581 data, offset, len, access);
585 * Load the PAE pdptrs. Return true if they are all valid.
587 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
589 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
590 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
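/*
 * Explanatory note (not in the original source): in PAE mode CR3[31:5]
 * holds the 32-byte-aligned address of the 4-entry PDPT, so bits 11:5
 * locate it within its page. Dividing by 32 and multiplying by 4 turns
 * that into an index in units of u64 entries, e.g. (cr3 & 0xfff) == 0xa0
 * gives offset = 20, i.e. byte 160 -- exactly where the PDPT starts.
 */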
593 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
595 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
596 offset * sizeof(u64), sizeof(pdpte),
597 PFERR_USER_MASK|PFERR_WRITE_MASK);
602 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
603 if ((pdpte[i] & PT_PRESENT_MASK) &&
605 vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
612 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
613 __set_bit(VCPU_EXREG_PDPTR,
614 (unsigned long *)&vcpu->arch.regs_avail);
615 __set_bit(VCPU_EXREG_PDPTR,
616 (unsigned long *)&vcpu->arch.regs_dirty);
621 EXPORT_SYMBOL_GPL(load_pdptrs);
623 bool pdptrs_changed(struct kvm_vcpu *vcpu)
625 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
631 if (is_long_mode(vcpu) || !is_pae(vcpu))
634 if (!test_bit(VCPU_EXREG_PDPTR,
635 (unsigned long *)&vcpu->arch.regs_avail))
638 gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
639 offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
640 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
641 PFERR_USER_MASK | PFERR_WRITE_MASK);
644 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
649 EXPORT_SYMBOL_GPL(pdptrs_changed);
651 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
653 unsigned long old_cr0 = kvm_read_cr0(vcpu);
654 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
659 if (cr0 & 0xffffffff00000000UL)
663 cr0 &= ~CR0_RESERVED_BITS;
665 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
668 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
671 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
673 if ((vcpu->arch.efer & EFER_LME)) {
678 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
683 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
688 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
691 kvm_x86_ops->set_cr0(vcpu, cr0);
693 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
694 kvm_clear_async_pf_completion_queue(vcpu);
695 kvm_async_pf_hash_reset(vcpu);
698 if ((cr0 ^ old_cr0) & update_bits)
699 kvm_mmu_reset_context(vcpu);
701 if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
702 kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
703 !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
704 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
708 EXPORT_SYMBOL_GPL(kvm_set_cr0);
710 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
712 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
714 EXPORT_SYMBOL_GPL(kvm_lmsw);
716 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
718 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
719 !vcpu->guest_xcr0_loaded) {
720 /* kvm_set_xcr() also depends on this */
721 if (vcpu->arch.xcr0 != host_xcr0)
722 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
723 vcpu->guest_xcr0_loaded = 1;
727 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
729 if (vcpu->guest_xcr0_loaded) {
730 if (vcpu->arch.xcr0 != host_xcr0)
731 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
732 vcpu->guest_xcr0_loaded = 0;
736 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
739 u64 old_xcr0 = vcpu->arch.xcr0;
742 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
743 if (index != XCR_XFEATURE_ENABLED_MASK)
745 if (!(xcr0 & XFEATURE_MASK_FP))
747 if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
751 * Do not allow the guest to set bits that we do not support
752 * saving. However, xcr0 bit 0 is always set, even if the
753 * emulated CPU does not support XSAVE (see fx_init).
755 valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
756 if (xcr0 & ~valid_bits)
759 if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
760 (!(xcr0 & XFEATURE_MASK_BNDCSR)))
763 if (xcr0 & XFEATURE_MASK_AVX512) {
764 if (!(xcr0 & XFEATURE_MASK_YMM))
766 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
769 vcpu->arch.xcr0 = xcr0;
771 if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
772 kvm_update_cpuid(vcpu);
776 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
778 if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
779 __kvm_set_xcr(vcpu, index, xcr)) {
780 kvm_inject_gp(vcpu, 0);
785 EXPORT_SYMBOL_GPL(kvm_set_xcr);
787 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
789 unsigned long old_cr4 = kvm_read_cr4(vcpu);
790 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
791 X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
793 if (cr4 & CR4_RESERVED_BITS)
796 if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
799 if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
802 if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
805 if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
808 if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
811 if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
814 if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
817 if (is_long_mode(vcpu)) {
818 if (!(cr4 & X86_CR4_PAE))
820 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
821 && ((cr4 ^ old_cr4) & pdptr_bits)
822 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
826 if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
827 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
830 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
831 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
835 if (kvm_x86_ops->set_cr4(vcpu, cr4))
838 if (((cr4 ^ old_cr4) & pdptr_bits) ||
839 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
840 kvm_mmu_reset_context(vcpu);
842 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
843 kvm_update_cpuid(vcpu);
847 EXPORT_SYMBOL_GPL(kvm_set_cr4);
849 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
851 bool skip_tlb_flush = false;
853 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
856 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
857 cr3 &= ~X86_CR3_PCID_NOFLUSH;
861 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
862 if (!skip_tlb_flush) {
863 kvm_mmu_sync_roots(vcpu);
864 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
869 if (is_long_mode(vcpu) &&
870 (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
872 else if (is_pae(vcpu) && is_paging(vcpu) &&
873 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
876 kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
877 vcpu->arch.cr3 = cr3;
878 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
882 EXPORT_SYMBOL_GPL(kvm_set_cr3);
884 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
886 if (cr8 & CR8_RESERVED_BITS)
888 if (lapic_in_kernel(vcpu))
889 kvm_lapic_set_tpr(vcpu, cr8);
891 vcpu->arch.cr8 = cr8;
894 EXPORT_SYMBOL_GPL(kvm_set_cr8);
896 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
898 if (lapic_in_kernel(vcpu))
899 return kvm_lapic_get_cr8(vcpu);
901 return vcpu->arch.cr8;
903 EXPORT_SYMBOL_GPL(kvm_get_cr8);
905 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
909 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
910 for (i = 0; i < KVM_NR_DB_REGS; i++)
911 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
912 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
916 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
918 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
919 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
922 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
926 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
927 dr7 = vcpu->arch.guest_debug_dr7;
929 dr7 = vcpu->arch.dr7;
930 kvm_x86_ops->set_dr7(vcpu, dr7);
931 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
932 if (dr7 & DR7_BP_EN_MASK)
933 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
936 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
938 u64 fixed = DR6_FIXED_1;
940 if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
945 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
949 vcpu->arch.db[dr] = val;
950 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
951 vcpu->arch.eff_db[dr] = val;
956 if (val & 0xffffffff00000000ULL)
958 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
959 kvm_update_dr6(vcpu);
964 if (val & 0xffffffff00000000ULL)
966 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
967 kvm_update_dr7(vcpu);
974 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
976 if (__kvm_set_dr(vcpu, dr, val)) {
977 kvm_inject_gp(vcpu, 0);
982 EXPORT_SYMBOL_GPL(kvm_set_dr);
984 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
988 *val = vcpu->arch.db[dr];
993 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
994 *val = vcpu->arch.dr6;
996 *val = kvm_x86_ops->get_dr6(vcpu);
1001 *val = vcpu->arch.dr7;
1006 EXPORT_SYMBOL_GPL(kvm_get_dr);
1008 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1010 u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1014 err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1017 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1018 kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1021 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1024 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1025 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1027 * This list is modified at module load time to reflect the
1028 * capabilities of the host cpu. This capabilities test skips MSRs that are
1029 * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1030 * may depend on host virtualization features rather than host cpu features.
1033 static u32 msrs_to_save[] = {
1034 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1036 #ifdef CONFIG_X86_64
1037 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1039 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1040 MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1041 MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1044 static unsigned num_msrs_to_save;
1046 static u32 emulated_msrs[] = {
1047 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1048 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1049 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1050 HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1051 HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1052 HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1053 HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1055 HV_X64_MSR_VP_INDEX,
1056 HV_X64_MSR_VP_RUNTIME,
1057 HV_X64_MSR_SCONTROL,
1058 HV_X64_MSR_STIMER0_CONFIG,
1059 HV_X64_MSR_VP_ASSIST_PAGE,
1060 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1061 HV_X64_MSR_TSC_EMULATION_STATUS,
1063 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1066 MSR_IA32_TSC_ADJUST,
1067 MSR_IA32_TSCDEADLINE,
1068 MSR_IA32_MISC_ENABLE,
1069 MSR_IA32_MCG_STATUS,
1071 MSR_IA32_MCG_EXT_CTL,
1075 MSR_MISC_FEATURES_ENABLES,
1076 MSR_AMD64_VIRT_SPEC_CTRL,
1079 static unsigned num_emulated_msrs;
1082 * List of msr numbers which are used to expose MSR-based features that
1083 * can be used by a hypervisor to validate requested CPU features.
1085 static u32 msr_based_features[] = {
1087 MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1088 MSR_IA32_VMX_PINBASED_CTLS,
1089 MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1090 MSR_IA32_VMX_PROCBASED_CTLS,
1091 MSR_IA32_VMX_TRUE_EXIT_CTLS,
1092 MSR_IA32_VMX_EXIT_CTLS,
1093 MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1094 MSR_IA32_VMX_ENTRY_CTLS,
1096 MSR_IA32_VMX_CR0_FIXED0,
1097 MSR_IA32_VMX_CR0_FIXED1,
1098 MSR_IA32_VMX_CR4_FIXED0,
1099 MSR_IA32_VMX_CR4_FIXED1,
1100 MSR_IA32_VMX_VMCS_ENUM,
1101 MSR_IA32_VMX_PROCBASED_CTLS2,
1102 MSR_IA32_VMX_EPT_VPID_CAP,
1103 MSR_IA32_VMX_VMFUNC,
1107 MSR_IA32_ARCH_CAPABILITIES,
1110 static unsigned int num_msr_based_features;
1112 u64 kvm_get_arch_capabilities(void)
1116 rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1119 * If we're doing cache flushes (either "always" or "cond")
1120 * we will do one whenever the guest does a vmlaunch/vmresume.
1121 * If an outer hypervisor is doing the cache flush for us
1122 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1123 * capability to the guest too, and if EPT is disabled we're not
1124 * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
1125 * require a nested hypervisor to do a flush of its own.
1127 if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1128 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1132 EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
1134 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1136 switch (msr->index) {
1137 case MSR_IA32_ARCH_CAPABILITIES:
1138 msr->data = kvm_get_arch_capabilities();
1140 case MSR_IA32_UCODE_REV:
1141 rdmsrl_safe(msr->index, &msr->data);
1144 if (kvm_x86_ops->get_msr_feature(msr))
1150 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1152 struct kvm_msr_entry msr;
1156 r = kvm_get_msr_feature(&msr);
1165 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1167 if (efer & efer_reserved_bits)
1170 if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1173 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1178 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1180 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
1182 u64 old_efer = vcpu->arch.efer;
1184 if (!kvm_valid_efer(vcpu, efer))
1188 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1192 efer |= vcpu->arch.efer & EFER_LMA;
1194 kvm_x86_ops->set_efer(vcpu, efer);
1196 /* Update reserved bits */
1197 if ((efer ^ old_efer) & EFER_NX)
1198 kvm_mmu_reset_context(vcpu);
1203 void kvm_enable_efer_bits(u64 mask)
1205 efer_reserved_bits &= ~mask;
1207 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1210 * Writes msr value into the appropriate "register".
1211 * Returns 0 on success, non-0 otherwise.
1212 * Assumes vcpu_load() was already called.
1214 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1216 switch (msr->index) {
1219 case MSR_KERNEL_GS_BASE:
1222 if (is_noncanonical_address(msr->data, vcpu))
1225 case MSR_IA32_SYSENTER_EIP:
1226 case MSR_IA32_SYSENTER_ESP:
1228 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1229 * non-canonical address is written on Intel but not on
1230 * AMD (which ignores the top 32-bits, because it does
1231 * not implement 64-bit SYSENTER).
1233 * 64-bit code should hence be able to write a non-canonical
1234 * value on AMD. Making the address canonical ensures that
1235 * vmentry does not fail on Intel after writing a non-canonical
1236 * value, and that something deterministic happens if the guest
1237 * invokes 64-bit SYSENTER.
1239 msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1241 return kvm_x86_ops->set_msr(vcpu, msr);
1243 EXPORT_SYMBOL_GPL(kvm_set_msr);
1246 * Adapt set_msr() to msr_io()'s calling convention
1248 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1250 struct msr_data msr;
1254 msr.host_initiated = true;
1255 r = kvm_get_msr(vcpu, &msr);
1263 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1265 struct msr_data msr;
1269 msr.host_initiated = true;
1270 return kvm_set_msr(vcpu, &msr);
1273 #ifdef CONFIG_X86_64
1274 struct pvclock_gtod_data {
1277 struct { /* extract of a clocksource struct */
1290 static struct pvclock_gtod_data pvclock_gtod_data;
1292 static void update_pvclock_gtod(struct timekeeper *tk)
1294 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1297 boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
1299 write_seqcount_begin(&vdata->seq);
1301 /* copy pvclock gtod data */
1302 vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
1303 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
1304 vdata->clock.mask = tk->tkr_mono.mask;
1305 vdata->clock.mult = tk->tkr_mono.mult;
1306 vdata->clock.shift = tk->tkr_mono.shift;
1308 vdata->boot_ns = boot_ns;
1309 vdata->nsec_base = tk->tkr_mono.xtime_nsec;
1311 vdata->wall_time_sec = tk->xtime_sec;
1313 write_seqcount_end(&vdata->seq);
1317 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1320 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1321 * vcpu_enter_guest. This function is only called from
1322 * the physical CPU that is running vcpu.
1324 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1327 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1331 struct pvclock_wall_clock wc;
1332 struct timespec64 boot;
1337 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1342 ++version; /* first time write, random junk */
1346 if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1350 * The guest calculates current wall clock time by adding
1351 * system time (updated by kvm_guest_time_update below) to the
1352 * wall clock specified here. Guest system time equals host
1353 * system time for us, thus we must fill in host boot time here.
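/*
 * Illustrative summary (not in the original source): wc written below is
 * the host's boot timestamp minus kvmclock_offset, while the per-vcpu
 * system_time published by kvm_guest_time_update() is nanoseconds since
 * host boot plus kvmclock_offset, so in the guest
 *	wall_clock ~= wc + system_time
 * works out to the host's current real time; the offsets cancel.
 */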
1355 getboottime64(&boot);
1357 if (kvm->arch.kvmclock_offset) {
1358 struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1359 boot = timespec64_sub(boot, ts);
1361 wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1362 wc.nsec = boot.tv_nsec;
1363 wc.version = version;
1365 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1368 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1371 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1373 do_shl32_div32(dividend, divisor);
1377 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1378 s8 *pshift, u32 *pmultiplier)
1386 scaled64 = scaled_hz;
1387 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1392 tps32 = (uint32_t)tps64;
1393 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1394 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1402 *pmultiplier = div_frac(scaled64, tps32);
1404 pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1405 __func__, base_hz, scaled_hz, shift, *pmultiplier);
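/*
 * Rough relationship of the pair computed above (illustrative, not from
 * the original source): pshift/pmultiplier are chosen so that
 *	scaled_hz ~= base_hz * 2^pshift * pmultiplier / 2^32
 * which is the form pvclock-style consumers evaluate as
 *	((delta << pshift) * pmultiplier) >> 32
 * (a negative pshift meaning a right shift), converting a count at
 * base_hz into the equivalent count at scaled_hz.
 */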
1408 #ifdef CONFIG_X86_64
1409 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1412 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1413 static unsigned long max_tsc_khz;
1415 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1417 u64 v = (u64)khz * (1000000 + ppm);
1422 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1426 /* Guest TSC same frequency as host TSC? */
1428 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1432 /* TSC scaling supported? */
1433 if (!kvm_has_tsc_control) {
1434 if (user_tsc_khz > tsc_khz) {
1435 vcpu->arch.tsc_catchup = 1;
1436 vcpu->arch.tsc_always_catchup = 1;
1439 WARN(1, "user requested TSC rate below hardware speed\n");
1444 /* TSC scaling required - calculate ratio */
1445 ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1446 user_tsc_khz, tsc_khz);
1448 if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
1449 WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1454 vcpu->arch.tsc_scaling_ratio = ratio;
1458 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1460 u32 thresh_lo, thresh_hi;
1461 int use_scaling = 0;
1463 /* tsc_khz can be zero if TSC calibration fails */
1464 if (user_tsc_khz == 0) {
1465 /* set tsc_scaling_ratio to a safe value */
1466 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1470 /* Compute a scale to convert nanoseconds into TSC cycles */
1471 kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
1472 &vcpu->arch.virtual_tsc_shift,
1473 &vcpu->arch.virtual_tsc_mult);
1474 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
1477 * Compute the variation in TSC rate which is acceptable
1478 * within the range of tolerance and decide if the
1479 * rate being applied is within those bounds of the hardware
1480 * rate. If so, no scaling or compensation need be done.
1482 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1483 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
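/*
 * Example with the defaults (illustrative): tsc_tolerance_ppm = 250 and
 * a host tsc_khz of 2000000 give a band of 1999500..2000500 kHz, i.e.
 * +/-0.5 MHz; a requested rate outside that band needs scaling/catchup.
 */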
1484 if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
1485 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
1488 return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
1491 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1493 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1494 vcpu->arch.virtual_tsc_mult,
1495 vcpu->arch.virtual_tsc_shift);
1496 tsc += vcpu->arch.this_tsc_write;
1500 static inline int gtod_is_based_on_tsc(int mode)
1502 return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
1505 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1507 #ifdef CONFIG_X86_64
1509 struct kvm_arch *ka = &vcpu->kvm->arch;
1510 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1512 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1513 atomic_read(&vcpu->kvm->online_vcpus));
1516 * Once the masterclock is enabled, always perform request in
1517 * order to update it.
1519 * In order to enable masterclock, the host clocksource must be TSC
1520 * and the vcpus need to have matched TSCs. When that happens,
1521 * perform request to enable masterclock.
1523 if (ka->use_master_clock ||
1524 (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
1525 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1527 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1528 atomic_read(&vcpu->kvm->online_vcpus),
1529 ka->use_master_clock, gtod->clock.vclock_mode);
1533 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1535 u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1536 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1540 * Multiply tsc by a fixed point number represented by ratio.
1542 * The most significant 64-N bits (mult) of ratio represent the
1543 * integral part of the fixed point number; the remaining N bits
1544 * (frac) represent the fractional part, ie. ratio represents a fixed
1545 * point number (mult + frac * 2^(-N)).
1547 * N equals to kvm_tsc_scaling_ratio_frac_bits.
1549 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1551 return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
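/*
 * Illustrative example (values assumed): with N = 48 fractional bits
 * (as on VMX, for instance), set_tsc_khz() above computes
 * ratio = (user_tsc_khz << N) / tsc_khz, so a guest TSC at 1.5x the
 * host rate yields ratio = 3ULL << 47 (i.e. 1.5 * 2^48) and
 * __scale_tsc(ratio, tsc) returns tsc * 3 / 2.
 */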
1554 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1557 u64 ratio = vcpu->arch.tsc_scaling_ratio;
1559 if (ratio != kvm_default_tsc_scaling_ratio)
1560 _tsc = __scale_tsc(ratio, tsc);
1564 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
1566 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1570 tsc = kvm_scale_tsc(vcpu, rdtsc());
1572 return target_tsc - tsc;
1575 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1577 u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1579 return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
1581 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1583 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1585 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1586 vcpu->arch.tsc_offset = offset;
1589 static inline bool kvm_check_tsc_unstable(void)
1591 #ifdef CONFIG_X86_64
1593 * TSC is marked unstable when we're running on Hyper-V, but the
1594 * 'TSC page' clocksource is still good.
1596 if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
1599 return check_tsc_unstable();
1602 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1604 struct kvm *kvm = vcpu->kvm;
1605 u64 offset, ns, elapsed;
1606 unsigned long flags;
1608 bool already_matched;
1609 u64 data = msr->data;
1610 bool synchronizing = false;
1612 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1613 offset = kvm_compute_tsc_offset(vcpu, data);
1614 ns = ktime_get_boot_ns();
1615 elapsed = ns - kvm->arch.last_tsc_nsec;
1617 if (vcpu->arch.virtual_tsc_khz) {
1618 if (data == 0 && msr->host_initiated) {
1620 * detection of vcpu initialization -- need to sync
1621 * with other vCPUs. This particularly helps to keep
1622 * kvm_clock stable after CPU hotplug
1624 synchronizing = true;
1626 u64 tsc_exp = kvm->arch.last_tsc_write +
1627 nsec_to_cycles(vcpu, elapsed);
1628 u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
1630 * Special case: TSC write with a small delta (1 second)
1631 * of virtual cycle time against real time is
1632 * interpreted as an attempt to synchronize the CPU.
1634 synchronizing = data < tsc_exp + tsc_hz &&
1635 data + tsc_hz > tsc_exp;
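/*
 * Illustrative case (numbers assumed): a 3 GHz guest whose kernel
 * writes TSC = 0 on vCPU0 and again, 10 ms of host time later, on
 * vCPU1: tsc_exp is then ~30M cycles, well inside the one-second
 * (3G-cycle) window above, so the second write is treated as a
 * synchronization attempt rather than the start of a new timeline.
 */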
1640 * For a reliable TSC, we can match TSC offsets, and for an unstable
1641 * TSC, we add elapsed time in this computation. We could let the
1642 * compensation code attempt to catch up if we fall behind, but
1643 * it's better to try to match offsets from the beginning.
1645 if (synchronizing &&
1646 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1647 if (!kvm_check_tsc_unstable()) {
1648 offset = kvm->arch.cur_tsc_offset;
1649 pr_debug("kvm: matched tsc offset for %llu\n", data);
1651 u64 delta = nsec_to_cycles(vcpu, elapsed);
1653 offset = kvm_compute_tsc_offset(vcpu, data);
1654 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1657 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1660 * We split periods of matched TSC writes into generations.
1661 * For each generation, we track the original measured
1662 * nanosecond time, offset, and write, so if TSCs are in
1663 * sync, we can match exact offset, and if not, we can match
1664 * exact software computation in compute_guest_tsc()
1666 * These values are tracked in kvm->arch.cur_xxx variables.
1668 kvm->arch.cur_tsc_generation++;
1669 kvm->arch.cur_tsc_nsec = ns;
1670 kvm->arch.cur_tsc_write = data;
1671 kvm->arch.cur_tsc_offset = offset;
1673 pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1674 kvm->arch.cur_tsc_generation, data);
1678 * We also track the most recent recorded KHZ, write, and time to
1679 * allow the matching interval to be extended at each write.
1681 kvm->arch.last_tsc_nsec = ns;
1682 kvm->arch.last_tsc_write = data;
1683 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1685 vcpu->arch.last_guest_tsc = data;
1687 /* Keep track of which generation this VCPU has synchronized to */
1688 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1689 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1690 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1692 if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1693 update_ia32_tsc_adjust_msr(vcpu, offset);
1695 kvm_vcpu_write_tsc_offset(vcpu, offset);
1696 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1698 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1700 kvm->arch.nr_vcpus_matched_tsc = 0;
1701 } else if (!already_matched) {
1702 kvm->arch.nr_vcpus_matched_tsc++;
1705 kvm_track_tsc_matching(vcpu);
1706 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1709 EXPORT_SYMBOL_GPL(kvm_write_tsc);
1711 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
1714 kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
1717 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
1719 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
1720 WARN_ON(adjustment < 0);
1721 adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
1722 adjust_tsc_offset_guest(vcpu, adjustment);
1725 #ifdef CONFIG_X86_64
1727 static u64 read_tsc(void)
1729 u64 ret = (u64)rdtsc_ordered();
1730 u64 last = pvclock_gtod_data.clock.cycle_last;
1732 if (likely(ret >= last))
1736 * GCC likes to generate cmov here, but this branch is extremely
1737 * predictable (it's just a function of time and the likely is
1738 * very likely) and there's a data dependence, so force GCC
1739 * to generate a branch instead. I don't barrier() because
1740 * we don't actually need a barrier, and if this function
1741 * ever gets inlined it will generate worse code.
1747 static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
1750 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1753 switch (gtod->clock.vclock_mode) {
1754 case VCLOCK_HVCLOCK:
1755 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
1757 if (tsc_pg_val != U64_MAX) {
1758 /* TSC page valid */
1759 *mode = VCLOCK_HVCLOCK;
1760 v = (tsc_pg_val - gtod->clock.cycle_last) &
1763 /* TSC page invalid */
1764 *mode = VCLOCK_NONE;
1769 *tsc_timestamp = read_tsc();
1770 v = (*tsc_timestamp - gtod->clock.cycle_last) &
1774 *mode = VCLOCK_NONE;
1777 if (*mode == VCLOCK_NONE)
1778 *tsc_timestamp = v = 0;
1780 return v * gtod->clock.mult;
1783 static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
1785 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1791 seq = read_seqcount_begin(&gtod->seq);
1792 ns = gtod->nsec_base;
1793 ns += vgettsc(tsc_timestamp, &mode);
1794 ns >>= gtod->clock.shift;
1795 ns += gtod->boot_ns;
1796 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1802 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
1804 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1810 seq = read_seqcount_begin(&gtod->seq);
1811 ts->tv_sec = gtod->wall_time_sec;
1812 ns = gtod->nsec_base;
1813 ns += vgettsc(tsc_timestamp, &mode);
1814 ns >>= gtod->clock.shift;
1815 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1817 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
1823 /* returns true if host is using TSC based clocksource */
1824 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
1826 /* checked again under seqlock below */
1827 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1830 return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
1834 /* returns true if host is using TSC based clocksource */
1835 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
1838 /* checked again under seqlock below */
1839 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1842 return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
1848 * Assuming a stable TSC across physical CPUs, and a stable TSC
1849 * across virtual CPUs, the following condition is possible.
1850 * Each numbered line represents an event visible to both
1851 * CPUs at the next numbered event.
1853 * "timespecX" represents host monotonic time. "tscX" represents
1856 * VCPU0 on CPU0 | VCPU1 on CPU1
1858 * 1. read timespec0,tsc0
1859 * 2. | timespec1 = timespec0 + N
1861 * 3. transition to guest | transition to guest
1862 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1863 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
1864 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1866 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1869 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1871 * - 0 < N - M => M < N
1873 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1874 * always the case (the difference between two distinct xtime instances
1875 * might be smaller than the difference between corresponding TSC reads,
1876 * when updating guest vcpus' pvclock areas).
1878 * To avoid that problem, do not allow visibility of distinct
1879 * system_timestamp/tsc_timestamp values simultaneously: use a master
1880 * copy of host monotonic time values. Update that master copy in lockstep.
1883 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1887 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1889 #ifdef CONFIG_X86_64
1890 struct kvm_arch *ka = &kvm->arch;
1892 bool host_tsc_clocksource, vcpus_matched;
1894 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1895 atomic_read(&kvm->online_vcpus));
1898 * If the host uses TSC clock, then passthrough TSC as stable to the guest.
1901 host_tsc_clocksource = kvm_get_time_and_clockread(
1902 &ka->master_kernel_ns,
1903 &ka->master_cycle_now);
1905 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1906 && !ka->backwards_tsc_observed
1907 && !ka->boot_vcpu_runs_old_kvmclock;
1909 if (ka->use_master_clock)
1910 atomic_set(&kvm_guest_has_master_clock, 1);
1912 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1913 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1918 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
1920 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
1923 static void kvm_gen_update_masterclock(struct kvm *kvm)
1925 #ifdef CONFIG_X86_64
1927 struct kvm_vcpu *vcpu;
1928 struct kvm_arch *ka = &kvm->arch;
1930 spin_lock(&ka->pvclock_gtod_sync_lock);
1931 kvm_make_mclock_inprogress_request(kvm);
1932 /* no guest entries from this point */
1933 pvclock_update_vm_gtod_copy(kvm);
1935 kvm_for_each_vcpu(i, vcpu, kvm)
1936 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1938 /* guest entries allowed */
1939 kvm_for_each_vcpu(i, vcpu, kvm)
1940 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
1942 spin_unlock(&ka->pvclock_gtod_sync_lock);
1946 u64 get_kvmclock_ns(struct kvm *kvm)
1948 struct kvm_arch *ka = &kvm->arch;
1949 struct pvclock_vcpu_time_info hv_clock;
1952 spin_lock(&ka->pvclock_gtod_sync_lock);
1953 if (!ka->use_master_clock) {
1954 spin_unlock(&ka->pvclock_gtod_sync_lock);
1955 return ktime_get_boot_ns() + ka->kvmclock_offset;
1958 hv_clock.tsc_timestamp = ka->master_cycle_now;
1959 hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
1960 spin_unlock(&ka->pvclock_gtod_sync_lock);
1962 /* both __this_cpu_read() and rdtsc() should be on the same cpu */
1965 if (__this_cpu_read(cpu_tsc_khz)) {
1966 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
1967 &hv_clock.tsc_shift,
1968 &hv_clock.tsc_to_system_mul);
1969 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
1971 ret = ktime_get_boot_ns() + ka->kvmclock_offset;
1978 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1980 struct kvm_vcpu_arch *vcpu = &v->arch;
1981 struct pvclock_vcpu_time_info guest_hv_clock;
1983 if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1984 &guest_hv_clock, sizeof(guest_hv_clock))))
1987 /* This VCPU is paused, but it's legal for a guest to read another
1988 * VCPU's kvmclock, so we really have to follow the specification where
1989 * it says that version is odd while data is being modified, and even otherwise.
1992 * Version field updates must be kept separate. This is because
1993 * kvm_write_guest_cached might use a "rep movs" instruction, and
1994 * writes within a string instruction are weakly ordered. So there
1995 * are three writes overall.
1997 * As a small optimization, only write the version field in the first
1998 * and third write. The vcpu->pv_time cache is still valid, because the
1999 * version field is the first in the struct.
2001 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
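/*
 * For illustration (guest-side view, not part of this file): a pvclock
 * reader is expected to retry roughly like
 *
 *	do {
 *		version = clock->version;
 *		rmb();
 *		... copy the payload ...
 *		rmb();
 *	} while ((version & 1) || version != clock->version);
 *
 * which is why the version bump below must bracket the payload write.
 */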
2003 if (guest_hv_clock.version & 1)
2004 ++guest_hv_clock.version; /* first time write, random junk */
2006 vcpu->hv_clock.version = guest_hv_clock.version + 1;
2007 kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2009 sizeof(vcpu->hv_clock.version));
2013 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2014 vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2016 if (vcpu->pvclock_set_guest_stopped_request) {
2017 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2018 vcpu->pvclock_set_guest_stopped_request = false;
2021 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2023 kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2025 sizeof(vcpu->hv_clock));
2029 vcpu->hv_clock.version++;
2030 kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2032 sizeof(vcpu->hv_clock.version));
2035 static int kvm_guest_time_update(struct kvm_vcpu *v)
2037 unsigned long flags, tgt_tsc_khz;
2038 struct kvm_vcpu_arch *vcpu = &v->arch;
2039 struct kvm_arch *ka = &v->kvm->arch;
2041 u64 tsc_timestamp, host_tsc;
2043 bool use_master_clock;
2049 * If the host uses TSC clock, then passthrough TSC as stable to the guest.
2052 spin_lock(&ka->pvclock_gtod_sync_lock);
2053 use_master_clock = ka->use_master_clock;
2054 if (use_master_clock) {
2055 host_tsc = ka->master_cycle_now;
2056 kernel_ns = ka->master_kernel_ns;
2058 spin_unlock(&ka->pvclock_gtod_sync_lock);
2060 /* Keep irq disabled to prevent changes to the clock */
2061 local_irq_save(flags);
2062 tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2063 if (unlikely(tgt_tsc_khz == 0)) {
2064 local_irq_restore(flags);
2065 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2068 if (!use_master_clock) {
2070 kernel_ns = ktime_get_boot_ns();
2073 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2076 * We may have to catch up the TSC to match elapsed wall clock
2077 * time for two reasons, even if kvmclock is used.
2078 * 1) CPU could have been running below the maximum TSC rate
2079 * 2) Broken TSC compensation resets the base at each VCPU
2080 * entry to avoid unknown leaps of TSC even when running
2081 * again on the same CPU. This may cause apparent elapsed
2082 * time to disappear, and the guest to stand still or run
2085 if (vcpu->tsc_catchup) {
2086 u64 tsc = compute_guest_tsc(v, kernel_ns);
2087 if (tsc > tsc_timestamp) {
2088 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2089 tsc_timestamp = tsc;
2093 local_irq_restore(flags);
2095 /* With all the info we got, fill in the values */
2097 if (kvm_has_tsc_control)
2098 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2100 if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2101 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2102 &vcpu->hv_clock.tsc_shift,
2103 &vcpu->hv_clock.tsc_to_system_mul);
2104 vcpu->hw_tsc_khz = tgt_tsc_khz;
2107 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2108 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2109 vcpu->last_guest_tsc = tsc_timestamp;
2111 /* If the host uses TSC clocksource, then it is stable */
2113 if (use_master_clock)
2114 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2116 vcpu->hv_clock.flags = pvclock_flags;
2118 if (vcpu->pv_time_enabled)
2119 kvm_setup_pvclock_page(v);
2120 if (v == kvm_get_vcpu(v->kvm, 0))
2121 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2126 * kvmclock updates which are isolated to a given vcpu, such as
2127 * vcpu->cpu migration, should not allow system_timestamp from
2128 * the rest of the vcpus to remain static. Otherwise ntp frequency
2129 * correction applies to one vcpu's system_timestamp but not the others.
2132 * So in those cases, request a kvmclock update for all vcpus.
2133 * We need to rate-limit these requests though, as they can
2134 * considerably slow guests that have a large number of vcpus.
2135 * The time for a remote vcpu to update its kvmclock is bound
2136 * by the delay we use to rate-limit the updates.
2139 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2141 static void kvmclock_update_fn(struct work_struct *work)
2144 struct delayed_work *dwork = to_delayed_work(work);
2145 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2146 kvmclock_update_work);
2147 struct kvm *kvm = container_of(ka, struct kvm, arch);
2148 struct kvm_vcpu *vcpu;
2150 kvm_for_each_vcpu(i, vcpu, kvm) {
2151 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2152 kvm_vcpu_kick(vcpu);
2156 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2158 struct kvm *kvm = v->kvm;
2160 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2161 schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2162 KVMCLOCK_UPDATE_DELAY);
2165 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2167 static void kvmclock_sync_fn(struct work_struct *work)
2169 struct delayed_work *dwork = to_delayed_work(work);
2170 struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2171 kvmclock_sync_work);
2172 struct kvm *kvm = container_of(ka, struct kvm, arch);
2174 if (!kvmclock_periodic_sync)
2177 schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2178 schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2179 KVMCLOCK_SYNC_PERIOD);
2182 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2184 u64 mcg_cap = vcpu->arch.mcg_cap;
2185 unsigned bank_num = mcg_cap & 0xff;
2186 u32 msr = msr_info->index;
2187 u64 data = msr_info->data;
2190 case MSR_IA32_MCG_STATUS:
2191 vcpu->arch.mcg_status = data;
2193 case MSR_IA32_MCG_CTL:
2194 if (!(mcg_cap & MCG_CTL_P) &&
2195 (data || !msr_info->host_initiated))
2197 if (data != 0 && data != ~(u64)0)
2199 vcpu->arch.mcg_ctl = data;
2202 if (msr >= MSR_IA32_MC0_CTL &&
2203 msr < MSR_IA32_MCx_CTL(bank_num)) {
2204 u32 offset = msr - MSR_IA32_MC0_CTL;
2205 /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2206 * Some Linux kernels, though, clear bit 10 in bank 4 to
2207 * work around a BIOS/GART TBL issue on AMD K8s; ignore
2208 * this to avoid an uncaught #GP in the guest.
2210 if ((offset & 0x3) == 0 &&
2211 data != 0 && (data | (1 << 10)) != ~(u64)0)
2213 if (!msr_info->host_initiated &&
2214 (offset & 0x3) == 1 && data != 0)
2216 vcpu->arch.mce_banks[offset] = data;
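/*
 * Example of the quirk above (illustrative): a guest write of
 * ~(1ULL << 10), i.e. all 1s except bit 10, to MSR_IA32_MC4_CTL is
 * accepted here rather than rejected, matching the bank-4 workaround
 * described in the comment, so such guests do not take a #GP.
 */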
2224 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2226 struct kvm *kvm = vcpu->kvm;
2227 int lm = is_long_mode(vcpu);
2228 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2229 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2230 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2231 : kvm->arch.xen_hvm_config.blob_size_32;
2232 u32 page_num = data & ~PAGE_MASK;
2233 u64 page_addr = data & PAGE_MASK;
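/*
 * Example of this encoding: a guest write of 0x7ff00002 requests page 2
 * of the hypercall blob to be copied into guest memory at GPA 0x7ff00000
 * (assuming 4KiB pages).
 */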
2238 if (page_num >= blob_size)
2241 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2246 if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2255 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2257 gpa_t gpa = data & ~0x3f;
2259 /* Bits 3:5 are reserved; should be zero */
2263 vcpu->arch.apf.msr_val = data;
2265 if (!(data & KVM_ASYNC_PF_ENABLED)) {
2266 kvm_clear_async_pf_completion_queue(vcpu);
2267 kvm_async_pf_hash_reset(vcpu);
2271 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2275 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2276 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2277 kvm_async_pf_wakeup_all(vcpu);
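/*
 * Layout of MSR_KVM_ASYNC_PF_EN as consumed above: bits 63:6 hold the
 * 64-byte aligned GPA of the shared async-PF data area, bit 0 enables
 * the mechanism, bit 1 (KVM_ASYNC_PF_SEND_ALWAYS) asks for notifications
 * even while the guest runs in kernel mode, bit 2 asks for delivery as a
 * #PF VM exit when a nested guest is running, and bits 3:5 must be zero.
 */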
2281 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2283 vcpu->arch.pv_time_enabled = false;
2286 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
2288 ++vcpu->stat.tlb_flush;
2289 kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
2292 static void record_steal_time(struct kvm_vcpu *vcpu)
2294 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2297 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2298 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2302 * Doing a TLB flush here, on the guest's behalf, can avoid expensive IPIs.
2305 if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
2306 kvm_vcpu_flush_tlb(vcpu, false);
2308 if (vcpu->arch.st.steal.version & 1)
2309 vcpu->arch.st.steal.version += 1; /* first time write, random junk */
2311 vcpu->arch.st.steal.version += 1;
2313 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2314 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2318 vcpu->arch.st.steal.steal += current->sched_info.run_delay -
2319 vcpu->arch.st.last_steal;
2320 vcpu->arch.st.last_steal = current->sched_info.run_delay;
2322 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2323 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2327 vcpu->arch.st.steal.version += 1;
2329 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2330 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
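/*
 * The version field above acts as a seqcount: it is made odd before the
 * steal figures are updated and even again afterwards.  A guest-side
 * reader would loop roughly like this sketch:
 *
 *	do {
 *		version = READ_ONCE(st->version);
 *		rmb();
 *		steal = st->steal;
 *		rmb();
 *	} while ((version & 1) || version != READ_ONCE(st->version));
 */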
2333 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2336 u32 msr = msr_info->index;
2337 u64 data = msr_info->data;
2340 case MSR_AMD64_NB_CFG:
2341 case MSR_IA32_UCODE_WRITE:
2342 case MSR_VM_HSAVE_PA:
2343 case MSR_AMD64_PATCH_LOADER:
2344 case MSR_AMD64_BU_CFG2:
2345 case MSR_AMD64_DC_CFG:
2348 case MSR_IA32_UCODE_REV:
2349 if (msr_info->host_initiated)
2350 vcpu->arch.microcode_version = data;
2353 return set_efer(vcpu, data);
2355 data &= ~(u64)0x40; /* ignore flush filter disable */
2356 data &= ~(u64)0x100; /* ignore IGNNE emulation enable */
2357 data &= ~(u64)0x8; /* ignore TLB cache disable */
2358 data &= ~(u64)0x40000; /* ignore MC status write enable */
2360 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2365 case MSR_FAM10H_MMIO_CONF_BASE:
2367 vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2372 case MSR_IA32_DEBUGCTLMSR:
2374 /* We support the non-activated case already */
2376 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2377 /* Values other than LBR and BTF are vendor-specific,
2378 thus reserved and should throw a #GP */
2381 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2384 case 0x200 ... 0x2ff:
2385 return kvm_mtrr_set_msr(vcpu, msr, data);
2386 case MSR_IA32_APICBASE:
2387 return kvm_set_apic_base(vcpu, msr_info);
2388 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2389 return kvm_x2apic_msr_write(vcpu, msr, data);
2390 case MSR_IA32_TSCDEADLINE:
2391 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2393 case MSR_IA32_TSC_ADJUST:
2394 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
2395 if (!msr_info->host_initiated) {
2396 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2397 adjust_tsc_offset_guest(vcpu, adj);
2399 vcpu->arch.ia32_tsc_adjust_msr = data;
2402 case MSR_IA32_MISC_ENABLE:
2403 vcpu->arch.ia32_misc_enable_msr = data;
2405 case MSR_IA32_SMBASE:
2406 if (!msr_info->host_initiated)
2408 vcpu->arch.smbase = data;
2411 kvm_write_tsc(vcpu, msr_info);
2414 if (!msr_info->host_initiated)
2416 vcpu->arch.smi_count = data;
2418 case MSR_KVM_WALL_CLOCK_NEW:
2419 case MSR_KVM_WALL_CLOCK:
2420 vcpu->kvm->arch.wall_clock = data;
2421 kvm_write_wall_clock(vcpu->kvm, data);
2423 case MSR_KVM_SYSTEM_TIME_NEW:
2424 case MSR_KVM_SYSTEM_TIME: {
2425 struct kvm_arch *ka = &vcpu->kvm->arch;
2427 kvmclock_reset(vcpu);
2429 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2430 bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2432 if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2433 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2435 ka->boot_vcpu_runs_old_kvmclock = tmp;
2438 vcpu->arch.time = data;
2439 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2441 /* Only set up the pvclock area if the enable bit (bit 0) is set. */
2445 if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2446 &vcpu->arch.pv_time, data & ~1ULL,
2447 sizeof(struct pvclock_vcpu_time_info)))
2448 vcpu->arch.pv_time_enabled = false;
2450 vcpu->arch.pv_time_enabled = true;
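/*
 * For MSR_KVM_SYSTEM_TIME{,_NEW} the written value packs the enable bit
 * into bit 0 and the guest physical address of the vcpu's
 * pvclock_vcpu_time_info area into the remaining bits, hence the
 * "data & ~1ULL" above.
 */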
2454 case MSR_KVM_ASYNC_PF_EN:
2455 if (kvm_pv_enable_async_pf(vcpu, data))
2458 case MSR_KVM_STEAL_TIME:
2460 if (unlikely(!sched_info_on()))
2463 if (data & KVM_STEAL_RESERVED_MASK)
2466 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2467 data & KVM_STEAL_VALID_BITS,
2468 sizeof(struct kvm_steal_time)))
2471 vcpu->arch.st.msr_val = data;
2473 if (!(data & KVM_MSR_ENABLED))
2476 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2479 case MSR_KVM_PV_EOI_EN:
2480 if (kvm_lapic_enable_pv_eoi(vcpu, data))
2484 case MSR_IA32_MCG_CTL:
2485 case MSR_IA32_MCG_STATUS:
2486 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2487 return set_msr_mce(vcpu, msr_info);
2489 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2490 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2491 pr = true; /* fall through */
2492 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2493 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2494 if (kvm_pmu_is_valid_msr(vcpu, msr))
2495 return kvm_pmu_set_msr(vcpu, msr_info);
2497 if (pr || data != 0)
2498 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2499 "0x%x data 0x%llx\n", msr, data);
2501 case MSR_K7_CLK_CTL:
2503 * Ignore all writes to this no longer documented MSR.
2504 * Writes are only relevant for old K7 processors,
2505 * all pre-dating SVM, but are a recommended workaround from
2506 * AMD for these chips. It is possible to specify the
2507 * affected processor models on the command line, hence
2508 * the need to ignore the workaround.
2511 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2512 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2513 case HV_X64_MSR_CRASH_CTL:
2514 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2515 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2516 case HV_X64_MSR_TSC_EMULATION_CONTROL:
2517 case HV_X64_MSR_TSC_EMULATION_STATUS:
2518 return kvm_hv_set_msr_common(vcpu, msr, data,
2519 msr_info->host_initiated);
2520 case MSR_IA32_BBL_CR_CTL3:
2521 /* Drop writes to this legacy MSR -- see rdmsr
2522 * counterpart for further detail.
2524 if (report_ignored_msrs)
2525 vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
2528 case MSR_AMD64_OSVW_ID_LENGTH:
2529 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2531 vcpu->arch.osvw.length = data;
2533 case MSR_AMD64_OSVW_STATUS:
2534 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2536 vcpu->arch.osvw.status = data;
2538 case MSR_PLATFORM_INFO:
2539 if (!msr_info->host_initiated ||
2540 data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
2541 (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
2542 cpuid_fault_enabled(vcpu)))
2544 vcpu->arch.msr_platform_info = data;
2546 case MSR_MISC_FEATURES_ENABLES:
2547 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
2548 (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
2549 !supports_cpuid_fault(vcpu)))
2551 vcpu->arch.msr_misc_features_enables = data;
2554 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2555 return xen_hvm_config(vcpu, data);
2556 if (kvm_pmu_is_valid_msr(vcpu, msr))
2557 return kvm_pmu_set_msr(vcpu, msr_info);
2559 vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2563 if (report_ignored_msrs)
2565 "ignored wrmsr: 0x%x data 0x%llx\n",
2572 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2576 * Reads an MSR value (for msr->index) into msr->data.
2577 * Returns 0 on success, non-0 otherwise.
2578 * Assumes vcpu_load() was already called.
2580 int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2582 return kvm_x86_ops->get_msr(vcpu, msr);
2584 EXPORT_SYMBOL_GPL(kvm_get_msr);
2586 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
2589 u64 mcg_cap = vcpu->arch.mcg_cap;
2590 unsigned bank_num = mcg_cap & 0xff;
2593 case MSR_IA32_P5_MC_ADDR:
2594 case MSR_IA32_P5_MC_TYPE:
2597 case MSR_IA32_MCG_CAP:
2598 data = vcpu->arch.mcg_cap;
2600 case MSR_IA32_MCG_CTL:
2601 if (!(mcg_cap & MCG_CTL_P) && !host)
2603 data = vcpu->arch.mcg_ctl;
2605 case MSR_IA32_MCG_STATUS:
2606 data = vcpu->arch.mcg_status;
2609 if (msr >= MSR_IA32_MC0_CTL &&
2610 msr < MSR_IA32_MCx_CTL(bank_num)) {
2611 u32 offset = msr - MSR_IA32_MC0_CTL;
2612 data = vcpu->arch.mce_banks[offset];
2621 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2623 switch (msr_info->index) {
2624 case MSR_IA32_PLATFORM_ID:
2625 case MSR_IA32_EBL_CR_POWERON:
2626 case MSR_IA32_DEBUGCTLMSR:
2627 case MSR_IA32_LASTBRANCHFROMIP:
2628 case MSR_IA32_LASTBRANCHTOIP:
2629 case MSR_IA32_LASTINTFROMIP:
2630 case MSR_IA32_LASTINTTOIP:
2632 case MSR_K8_TSEG_ADDR:
2633 case MSR_K8_TSEG_MASK:
2635 case MSR_VM_HSAVE_PA:
2636 case MSR_K8_INT_PENDING_MSG:
2637 case MSR_AMD64_NB_CFG:
2638 case MSR_FAM10H_MMIO_CONF_BASE:
2639 case MSR_AMD64_BU_CFG2:
2640 case MSR_IA32_PERF_CTL:
2641 case MSR_AMD64_DC_CFG:
2644 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
2645 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2646 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2647 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2648 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2649 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2650 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2653 case MSR_IA32_UCODE_REV:
2654 msr_info->data = vcpu->arch.microcode_version;
2657 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
2660 case 0x200 ... 0x2ff:
2661 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
2662 case 0xcd: /* fsb frequency */
2666 * MSR_EBC_FREQUENCY_ID
2667 * Conservative value valid for even the basic CPU models.
2668 * Models 0 and 1: 000 in bits 23:21 indicating a bus speed of
2669 * 100MHz, model 2: 000 in bits 18:16 indicating 100MHz,
2670 * and 266MHz for models 3 and 4. Set the Core Clock
2671 * Frequency to System Bus Frequency Ratio to 1 (bits
2672 * 31:24) even though this field is only valid for CPU
2673 * models > 2; without it, guests may end up dividing or
2674 * multiplying by zero.
2676 case MSR_EBC_FREQUENCY_ID:
2677 msr_info->data = 1 << 24;
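/*
 * 1 << 24 decodes as a core-clock to system-bus ratio of 1 in bits 31:24
 * with all bus-speed fields zero, i.e. the conservative 100MHz encoding
 * described above.
 */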
2679 case MSR_IA32_APICBASE:
2680 msr_info->data = kvm_get_apic_base(vcpu);
2682 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2683 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2685 case MSR_IA32_TSCDEADLINE:
2686 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
2688 case MSR_IA32_TSC_ADJUST:
2689 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2691 case MSR_IA32_MISC_ENABLE:
2692 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
2694 case MSR_IA32_SMBASE:
2695 if (!msr_info->host_initiated)
2697 msr_info->data = vcpu->arch.smbase;
2700 msr_info->data = vcpu->arch.smi_count;
2702 case MSR_IA32_PERF_STATUS:
2703 /* TSC increment by tick */
2704 msr_info->data = 1000ULL;
2705 /* CPU multiplier */
2706 msr_info->data |= (((uint64_t)4ULL) << 40);
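/*
 * Together this reports a synthetic IA32_PERF_STATUS of
 * 0x00000400000003e8: 1000 in the low word and a multiplier of 4 in
 * bits 47:40.
 */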
2709 msr_info->data = vcpu->arch.efer;
2711 case MSR_KVM_WALL_CLOCK:
2712 case MSR_KVM_WALL_CLOCK_NEW:
2713 msr_info->data = vcpu->kvm->arch.wall_clock;
2715 case MSR_KVM_SYSTEM_TIME:
2716 case MSR_KVM_SYSTEM_TIME_NEW:
2717 msr_info->data = vcpu->arch.time;
2719 case MSR_KVM_ASYNC_PF_EN:
2720 msr_info->data = vcpu->arch.apf.msr_val;
2722 case MSR_KVM_STEAL_TIME:
2723 msr_info->data = vcpu->arch.st.msr_val;
2725 case MSR_KVM_PV_EOI_EN:
2726 msr_info->data = vcpu->arch.pv_eoi.msr_val;
2728 case MSR_IA32_P5_MC_ADDR:
2729 case MSR_IA32_P5_MC_TYPE:
2730 case MSR_IA32_MCG_CAP:
2731 case MSR_IA32_MCG_CTL:
2732 case MSR_IA32_MCG_STATUS:
2733 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2734 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
2735 msr_info->host_initiated);
2736 case MSR_K7_CLK_CTL:
2738 * Provide the expected ramp-up count for K7. All other
2739 * fields are set to zero, indicating minimum divisors for every field.
2742 * This prevents guest kernels on an AMD host with CPU
2743 * type 6, model 8 and higher from exploding due to
2744 * the rdmsr failing.
2746 msr_info->data = 0x20000000;
2748 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2749 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2750 case HV_X64_MSR_CRASH_CTL:
2751 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2752 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2753 case HV_X64_MSR_TSC_EMULATION_CONTROL:
2754 case HV_X64_MSR_TSC_EMULATION_STATUS:
2755 return kvm_hv_get_msr_common(vcpu,
2756 msr_info->index, &msr_info->data,
2757 msr_info->host_initiated);
2759 case MSR_IA32_BBL_CR_CTL3:
2760 /* This legacy MSR exists but isn't fully documented in current
2761 * silicon. It is, however, accessed by Windows XP in very narrow
2762 * scenarios where it sets bit #19, itself documented as
2763 * a "reserved" bit. Best-effort attempt to source coherent
2764 * read data here should the balance of the register be
2765 * interpreted by the guest:
2767 * L2 cache control register 3: 64GB range, 256KB size,
2768 * enabled, latency 0x1, configured
2770 msr_info->data = 0xbe702111;
2772 case MSR_AMD64_OSVW_ID_LENGTH:
2773 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2775 msr_info->data = vcpu->arch.osvw.length;
2777 case MSR_AMD64_OSVW_STATUS:
2778 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2780 msr_info->data = vcpu->arch.osvw.status;
2782 case MSR_PLATFORM_INFO:
2783 msr_info->data = vcpu->arch.msr_platform_info;
2785 case MSR_MISC_FEATURES_ENABLES:
2786 msr_info->data = vcpu->arch.msr_misc_features_enables;
2789 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2790 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2792 vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2796 if (report_ignored_msrs)
2797 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2805 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2808 * Read or write a bunch of msrs. All parameters are kernel addresses.
2810 * @return number of msrs set successfully.
2812 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2813 struct kvm_msr_entry *entries,
2814 int (*do_msr)(struct kvm_vcpu *vcpu,
2815 unsigned index, u64 *data))
2819 for (i = 0; i < msrs->nmsrs; ++i)
2820 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2827 * Read or write a bunch of msrs. Parameters are user addresses.
2829 * @return number of msrs set successfully.
2831 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2832 int (*do_msr)(struct kvm_vcpu *vcpu,
2833 unsigned index, u64 *data),
2836 struct kvm_msrs msrs;
2837 struct kvm_msr_entry *entries;
2842 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2846 if (msrs.nmsrs >= MAX_IO_MSRS)
2849 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2850 entries = memdup_user(user_msrs->entries, size);
2851 if (IS_ERR(entries)) {
2852 r = PTR_ERR(entries);
2856 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2861 if (writeback && copy_to_user(user_msrs->entries, entries, size))
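/*
 * Userspace drives this through the KVM_GET_MSRS/KVM_SET_MSRS ioctls
 * (among others): a struct kvm_msrs header is immediately followed by
 * nmsrs kvm_msr_entry elements, and the ioctl returns how many entries
 * were processed before the first failure.
 */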
2872 static inline bool kvm_can_mwait_in_guest(void)
2874 return boot_cpu_has(X86_FEATURE_MWAIT) &&
2875 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
2876 boot_cpu_has(X86_FEATURE_ARAT);
2879 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2884 case KVM_CAP_IRQCHIP:
2886 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2887 case KVM_CAP_SET_TSS_ADDR:
2888 case KVM_CAP_EXT_CPUID:
2889 case KVM_CAP_EXT_EMUL_CPUID:
2890 case KVM_CAP_CLOCKSOURCE:
2892 case KVM_CAP_NOP_IO_DELAY:
2893 case KVM_CAP_MP_STATE:
2894 case KVM_CAP_SYNC_MMU:
2895 case KVM_CAP_USER_NMI:
2896 case KVM_CAP_REINJECT_CONTROL:
2897 case KVM_CAP_IRQ_INJECT_STATUS:
2898 case KVM_CAP_IOEVENTFD:
2899 case KVM_CAP_IOEVENTFD_NO_LENGTH:
2901 case KVM_CAP_PIT_STATE2:
2902 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2903 case KVM_CAP_XEN_HVM:
2904 case KVM_CAP_VCPU_EVENTS:
2905 case KVM_CAP_HYPERV:
2906 case KVM_CAP_HYPERV_VAPIC:
2907 case KVM_CAP_HYPERV_SPIN:
2908 case KVM_CAP_HYPERV_SYNIC:
2909 case KVM_CAP_HYPERV_SYNIC2:
2910 case KVM_CAP_HYPERV_VP_INDEX:
2911 case KVM_CAP_HYPERV_EVENTFD:
2912 case KVM_CAP_HYPERV_TLBFLUSH:
2913 case KVM_CAP_PCI_SEGMENT:
2914 case KVM_CAP_DEBUGREGS:
2915 case KVM_CAP_X86_ROBUST_SINGLESTEP:
2917 case KVM_CAP_ASYNC_PF:
2918 case KVM_CAP_GET_TSC_KHZ:
2919 case KVM_CAP_KVMCLOCK_CTRL:
2920 case KVM_CAP_READONLY_MEM:
2921 case KVM_CAP_HYPERV_TIME:
2922 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2923 case KVM_CAP_TSC_DEADLINE_TIMER:
2924 case KVM_CAP_ENABLE_CAP_VM:
2925 case KVM_CAP_DISABLE_QUIRKS:
2926 case KVM_CAP_SET_BOOT_CPU_ID:
2927 case KVM_CAP_SPLIT_IRQCHIP:
2928 case KVM_CAP_IMMEDIATE_EXIT:
2929 case KVM_CAP_GET_MSR_FEATURES:
2932 case KVM_CAP_SYNC_REGS:
2933 r = KVM_SYNC_X86_VALID_FIELDS;
2935 case KVM_CAP_ADJUST_CLOCK:
2936 r = KVM_CLOCK_TSC_STABLE;
2938 case KVM_CAP_X86_DISABLE_EXITS:
2939 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
2940 if(kvm_can_mwait_in_guest())
2941 r |= KVM_X86_DISABLE_EXITS_MWAIT;
2943 case KVM_CAP_X86_SMM:
2944 /* SMBASE is usually relocated above 1M on modern chipsets,
2945 * and SMM handlers might indeed rely on 4G segment limits,
2946 * so do not report SMM to be available if real mode is
2947 * emulated via vm86 mode. Still, do not go to great lengths
2948 * to avoid userspace's usage of the feature, because it is a
2949 * fringe case that is not enabled except via specific settings
2950 * of the module parameters.
2952 r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
2955 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2957 case KVM_CAP_NR_VCPUS:
2958 r = KVM_SOFT_MAX_VCPUS;
2960 case KVM_CAP_MAX_VCPUS:
2963 case KVM_CAP_NR_MEMSLOTS:
2964 r = KVM_USER_MEM_SLOTS;
2966 case KVM_CAP_PV_MMU: /* obsolete */
2970 r = KVM_MAX_MCE_BANKS;
2973 r = boot_cpu_has(X86_FEATURE_XSAVE);
2975 case KVM_CAP_TSC_CONTROL:
2976 r = kvm_has_tsc_control;
2978 case KVM_CAP_X2APIC_API:
2979 r = KVM_X2APIC_API_VALID_FLAGS;
2981 case KVM_CAP_NESTED_STATE:
2982 r = kvm_x86_ops->get_nested_state ?
2983 kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
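/*
 * Calling ->get_nested_state() with a NULL buffer is expected to return
 * just the size of the nested state blob, so this capability tells
 * userspace how much memory KVM_GET_NESTED_STATE will need.
 */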
2992 long kvm_arch_dev_ioctl(struct file *filp,
2993 unsigned int ioctl, unsigned long arg)
2995 void __user *argp = (void __user *)arg;
2999 case KVM_GET_MSR_INDEX_LIST: {
3000 struct kvm_msr_list __user *user_msr_list = argp;
3001 struct kvm_msr_list msr_list;
3005 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3008 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3009 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3012 if (n < msr_list.nmsrs)
3015 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3016 num_msrs_to_save * sizeof(u32)))
3018 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
3020 num_emulated_msrs * sizeof(u32)))
3025 case KVM_GET_SUPPORTED_CPUID:
3026 case KVM_GET_EMULATED_CPUID: {
3027 struct kvm_cpuid2 __user *cpuid_arg = argp;
3028 struct kvm_cpuid2 cpuid;
3031 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3034 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3040 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3045 case KVM_X86_GET_MCE_CAP_SUPPORTED: {
3047 if (copy_to_user(argp, &kvm_mce_cap_supported,
3048 sizeof(kvm_mce_cap_supported)))
3052 case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3053 struct kvm_msr_list __user *user_msr_list = argp;
3054 struct kvm_msr_list msr_list;
3058 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3061 msr_list.nmsrs = num_msr_based_features;
3062 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3065 if (n < msr_list.nmsrs)
3068 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3069 num_msr_based_features * sizeof(u32)))
3075 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3085 static void wbinvd_ipi(void *garbage)
3090 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3092 return kvm_arch_has_noncoherent_dma(vcpu->kvm);
3095 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3097 /* The guest may execute WBINVD; make sure it takes effect on the right physical CPU. */
3098 if (need_emulate_wbinvd(vcpu)) {
3099 if (kvm_x86_ops->has_wbinvd_exit())
3100 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3101 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3102 smp_call_function_single(vcpu->cpu,
3103 wbinvd_ipi, NULL, 1);
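/*
 * If the hardware can intercept WBINVD, the flush is deferred: the new
 * CPU is only marked in wbinvd_dirty_mask so it can be flushed when the
 * guest actually executes WBINVD.  Without that exit capability, the
 * previous physical CPU is flushed eagerly via an IPI.
 */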
3106 kvm_x86_ops->vcpu_load(vcpu, cpu);
3108 /* Apply any externally detected TSC adjustments (due to suspend) */
3109 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3110 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3111 vcpu->arch.tsc_offset_adjustment = 0;
3112 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3115 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
3116 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
3117 rdtsc() - vcpu->arch.last_host_tsc;
3119 mark_tsc_unstable("KVM discovered backwards TSC");
3121 if (kvm_check_tsc_unstable()) {
3122 u64 offset = kvm_compute_tsc_offset(vcpu,
3123 vcpu->arch.last_guest_tsc);
3124 kvm_vcpu_write_tsc_offset(vcpu, offset);
3125 vcpu->arch.tsc_catchup = 1;
3128 if (kvm_lapic_hv_timer_in_use(vcpu))
3129 kvm_lapic_restart_hv_timer(vcpu);
3132 * On a host with synchronized TSC, there is no need to update
3133 * kvmclock on vcpu->cpu migration
3135 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
3136 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
3137 if (vcpu->cpu != cpu)
3138 kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
3142 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3145 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3147 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3150 vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
3152 kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
3153 &vcpu->arch.st.steal.preempted,
3154 offsetof(struct kvm_steal_time, preempted),
3155 sizeof(vcpu->arch.st.steal.preempted));
3158 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3162 if (vcpu->preempted)
3163 vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
3166 * Disable page faults because we're in atomic context here.
3167 * kvm_write_guest_offset_cached() would call might_fault()
3168 * that relies on pagefault_disable() to tell if there's a
3169 * bug. NOTE: the write to guest memory may not go through, e.g.
3170 * during postcopy live migration or under heavy guest paging.
3173 pagefault_disable();
3175 * kvm_memslots() will be called by
3176 * kvm_write_guest_offset_cached() so take the srcu lock.
3178 idx = srcu_read_lock(&vcpu->kvm->srcu);
3179 kvm_steal_time_set_preempted(vcpu);
3180 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3182 kvm_x86_ops->vcpu_put(vcpu);
3183 vcpu->arch.last_host_tsc = rdtsc();
3185 * If userspace has set any breakpoints or watchpoints, dr6 is restored
3186 * on every vmexit, but if not, we might have a stale dr6 from the
3187 * guest. do_debug expects dr6 to be cleared after it runs; do the same.
3192 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
3193 struct kvm_lapic_state *s)
3195 if (vcpu->arch.apicv_active)
3196 kvm_x86_ops->sync_pir_to_irr(vcpu);
3198 return kvm_apic_get_state(vcpu, s);
3201 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
3202 struct kvm_lapic_state *s)
3206 r = kvm_apic_set_state(vcpu, s);
3209 update_cr8_intercept(vcpu);
3214 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
3216 return (!lapic_in_kernel(vcpu) ||
3217 kvm_apic_accept_pic_intr(vcpu));
3221 * If userspace requested an interrupt window, check that the
3222 * interrupt window is open.
3224 * No need to exit to userspace if we already have an interrupt queued.
3226 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
3228 return kvm_arch_interrupt_allowed(vcpu) &&
3229 !kvm_cpu_has_interrupt(vcpu) &&
3230 !kvm_event_needs_reinjection(vcpu) &&
3231 kvm_cpu_accept_dm_intr(vcpu);
3234 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
3235 struct kvm_interrupt *irq)
3237 if (irq->irq >= KVM_NR_INTERRUPTS)
3240 if (!irqchip_in_kernel(vcpu->kvm)) {
3241 kvm_queue_interrupt(vcpu, irq->irq, false);
3242 kvm_make_request(KVM_REQ_EVENT, vcpu);
3247 * With in-kernel LAPIC, we only use this to inject EXTINT, so
3248 * fail for in-kernel 8259.
3250 if (pic_in_kernel(vcpu->kvm))
3253 if (vcpu->arch.pending_external_vector != -1)
3256 vcpu->arch.pending_external_vector = irq->irq;
3257 kvm_make_request(KVM_REQ_EVENT, vcpu);
3261 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3263 kvm_inject_nmi(vcpu);
3268 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
3270 kvm_make_request(KVM_REQ_SMI, vcpu);
3275 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3276 struct kvm_tpr_access_ctl *tac)
3280 vcpu->arch.tpr_access_reporting = !!tac->enabled;
3284 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
3288 unsigned bank_num = mcg_cap & 0xff, bank;
3291 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
3293 if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
3296 vcpu->arch.mcg_cap = mcg_cap;
3297 /* Init IA32_MCG_CTL to all 1s */
3298 if (mcg_cap & MCG_CTL_P)
3299 vcpu->arch.mcg_ctl = ~(u64)0;
3300 /* Init IA32_MCi_CTL to all 1s */
3301 for (bank = 0; bank < bank_num; bank++)
3302 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
3304 if (kvm_x86_ops->setup_mce)
3305 kvm_x86_ops->setup_mce(vcpu);
3310 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
3311 struct kvm_x86_mce *mce)
3313 u64 mcg_cap = vcpu->arch.mcg_cap;
3314 unsigned bank_num = mcg_cap & 0xff;
3315 u64 *banks = vcpu->arch.mce_banks;
3317 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3320 * If IA32_MCG_CTL is not all 1s, uncorrected error
3321 * reporting is disabled.
3323 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3324 vcpu->arch.mcg_ctl != ~(u64)0)
3326 banks += 4 * mce->bank;
3328 * If IA32_MCi_CTL is not all 1s, uncorrected error
3329 * reporting is disabled for the bank.
3331 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3333 if (mce->status & MCI_STATUS_UC) {
3334 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
3335 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
3336 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3339 if (banks[1] & MCI_STATUS_VAL)
3340 mce->status |= MCI_STATUS_OVER;
3341 banks[2] = mce->addr;
3342 banks[3] = mce->misc;
3343 vcpu->arch.mcg_status = mce->mcg_status;
3344 banks[1] = mce->status;
3345 kvm_queue_exception(vcpu, MC_VECTOR);
3346 } else if (!(banks[1] & MCI_STATUS_VAL)
3347 || !(banks[1] & MCI_STATUS_UC)) {
3348 if (banks[1] & MCI_STATUS_VAL)
3349 mce->status |= MCI_STATUS_OVER;
3350 banks[2] = mce->addr;
3351 banks[3] = mce->misc;
3352 banks[1] = mce->status;
3354 banks[1] |= MCI_STATUS_OVER;
3358 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3359 struct kvm_vcpu_events *events)
3363 * FIXME: pass injected and pending separately. This is only
3364 * needed for nested virtualization, whose state cannot be
3365 * migrated yet. For now we can combine them.
3367 events->exception.injected =
3368 (vcpu->arch.exception.pending ||
3369 vcpu->arch.exception.injected) &&
3370 !kvm_exception_is_soft(vcpu->arch.exception.nr);
3371 events->exception.nr = vcpu->arch.exception.nr;
3372 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3373 events->exception.pad = 0;
3374 events->exception.error_code = vcpu->arch.exception.error_code;
3376 events->interrupt.injected =
3377 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3378 events->interrupt.nr = vcpu->arch.interrupt.nr;
3379 events->interrupt.soft = 0;
3380 events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
3382 events->nmi.injected = vcpu->arch.nmi_injected;
3383 events->nmi.pending = vcpu->arch.nmi_pending != 0;
3384 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
3385 events->nmi.pad = 0;
3387 events->sipi_vector = 0; /* never valid when reporting to user space */
3389 events->smi.smm = is_smm(vcpu);
3390 events->smi.pending = vcpu->arch.smi_pending;
3391 events->smi.smm_inside_nmi =
3392 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3393 events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3395 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3396 | KVM_VCPUEVENT_VALID_SHADOW
3397 | KVM_VCPUEVENT_VALID_SMM);
3398 memset(&events->reserved, 0, sizeof(events->reserved));
3401 static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
3403 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3404 struct kvm_vcpu_events *events)
3406 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3407 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3408 | KVM_VCPUEVENT_VALID_SHADOW
3409 | KVM_VCPUEVENT_VALID_SMM))
3412 if (events->exception.injected &&
3413 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3414 is_guest_mode(vcpu)))
3417 /* INITs are latched while in SMM */
3418 if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
3419 (events->smi.smm || events->smi.pending) &&
3420 vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3424 vcpu->arch.exception.injected = false;
3425 vcpu->arch.exception.pending = events->exception.injected;
3426 vcpu->arch.exception.nr = events->exception.nr;
3427 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3428 vcpu->arch.exception.error_code = events->exception.error_code;
3430 vcpu->arch.interrupt.injected = events->interrupt.injected;
3431 vcpu->arch.interrupt.nr = events->interrupt.nr;
3432 vcpu->arch.interrupt.soft = events->interrupt.soft;
3433 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3434 kvm_x86_ops->set_interrupt_shadow(vcpu,
3435 events->interrupt.shadow);
3437 vcpu->arch.nmi_injected = events->nmi.injected;
3438 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3439 vcpu->arch.nmi_pending = events->nmi.pending;
3440 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3442 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3443 lapic_in_kernel(vcpu))
3444 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3446 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3447 u32 hflags = vcpu->arch.hflags;
3448 if (events->smi.smm)
3449 hflags |= HF_SMM_MASK;
3451 hflags &= ~HF_SMM_MASK;
3452 kvm_set_hflags(vcpu, hflags);
3454 vcpu->arch.smi_pending = events->smi.pending;
3456 if (events->smi.smm) {
3457 if (events->smi.smm_inside_nmi)
3458 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
3460 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3461 if (lapic_in_kernel(vcpu)) {
3462 if (events->smi.latched_init)
3463 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3465 clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3470 kvm_make_request(KVM_REQ_EVENT, vcpu);
3475 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3476 struct kvm_debugregs *dbgregs)
3480 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3481 kvm_get_dr(vcpu, 6, &val);
3483 dbgregs->dr7 = vcpu->arch.dr7;
3485 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3488 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3489 struct kvm_debugregs *dbgregs)
3494 if (dbgregs->dr6 & ~0xffffffffull)
3496 if (dbgregs->dr7 & ~0xffffffffull)