arch/x86/kvm/x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Amit Shah    <amit.shah@qumranet.com>
15  *   Ben-Ami Yassour <benami@il.ibm.com>
16  *
17  * This work is licensed under the terms of the GNU GPL, version 2.  See
18  * the COPYING file in the top-level directory.
19  *
20  */
21
22 #include <linux/kvm_host.h>
23 #include "irq.h"
24 #include "mmu.h"
25 #include "i8254.h"
26 #include "tss.h"
27 #include "kvm_cache_regs.h"
28 #include "x86.h"
29 #include "cpuid.h"
30 #include "pmu.h"
31 #include "hyperv.h"
32
33 #include <linux/clocksource.h>
34 #include <linux/interrupt.h>
35 #include <linux/kvm.h>
36 #include <linux/fs.h>
37 #include <linux/vmalloc.h>
38 #include <linux/export.h>
39 #include <linux/moduleparam.h>
40 #include <linux/mman.h>
41 #include <linux/highmem.h>
42 #include <linux/iommu.h>
43 #include <linux/intel-iommu.h>
44 #include <linux/cpufreq.h>
45 #include <linux/user-return-notifier.h>
46 #include <linux/srcu.h>
47 #include <linux/slab.h>
48 #include <linux/perf_event.h>
49 #include <linux/uaccess.h>
50 #include <linux/hash.h>
51 #include <linux/pci.h>
52 #include <linux/timekeeper_internal.h>
53 #include <linux/pvclock_gtod.h>
54 #include <linux/kvm_irqfd.h>
55 #include <linux/irqbypass.h>
56 #include <linux/sched/stat.h>
57 #include <linux/mem_encrypt.h>
58
59 #include <trace/events/kvm.h>
60
61 #include <asm/debugreg.h>
62 #include <asm/msr.h>
63 #include <asm/desc.h>
64 #include <asm/mce.h>
65 #include <linux/kernel_stat.h>
66 #include <asm/fpu/internal.h> /* Ugh! */
67 #include <asm/pvclock.h>
68 #include <asm/div64.h>
69 #include <asm/irq_remapping.h>
70 #include <asm/mshyperv.h>
71 #include <asm/hypervisor.h>
72
73 #define CREATE_TRACE_POINTS
74 #include "trace.h"
75
76 #define MAX_IO_MSRS 256
77 #define KVM_MAX_MCE_BANKS 32
78 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
79 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
80
81 #define emul_to_vcpu(ctxt) \
82         container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
83
84 /* EFER defaults:
85  * - enable SYSCALL by default because it is emulated by KVM
86  * - enable LME and LMA by default on 64-bit KVM
87  */
88 #ifdef CONFIG_X86_64
89 static
90 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
91 #else
92 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
93 #endif
94
95 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
96 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
97
98 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
99                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
100
101 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
102 static void process_nmi(struct kvm_vcpu *vcpu);
103 static void enter_smm(struct kvm_vcpu *vcpu);
104 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
105 static void store_regs(struct kvm_vcpu *vcpu);
106 static int sync_regs(struct kvm_vcpu *vcpu);
107
108 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
109 EXPORT_SYMBOL_GPL(kvm_x86_ops);
110
111 static bool __read_mostly ignore_msrs = false;
112 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
113
114 static bool __read_mostly report_ignored_msrs = true;
115 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
116
117 unsigned int min_timer_period_us = 200;
118 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
119
120 static bool __read_mostly kvmclock_periodic_sync = true;
121 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
122
123 bool __read_mostly kvm_has_tsc_control;
124 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
125 u32  __read_mostly kvm_max_guest_tsc_khz;
126 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
127 u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
128 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
129 u64  __read_mostly kvm_max_tsc_scaling_ratio;
130 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
131 u64 __read_mostly kvm_default_tsc_scaling_ratio;
132 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
133
134 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
135 static u32 __read_mostly tsc_tolerance_ppm = 250;
136 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
137
138 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
139 unsigned int __read_mostly lapic_timer_advance_ns = 0;
140 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
141 EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
142
143 static bool __read_mostly vector_hashing = true;
144 module_param(vector_hashing, bool, S_IRUGO);
145
146 bool __read_mostly enable_vmware_backdoor = false;
147 module_param(enable_vmware_backdoor, bool, S_IRUGO);
148 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
149
150 static bool __read_mostly force_emulation_prefix = false;
151 module_param(force_emulation_prefix, bool, S_IRUGO);
152
153 #define KVM_NR_SHARED_MSRS 16
154
155 struct kvm_shared_msrs_global {
156         int nr;
157         u32 msrs[KVM_NR_SHARED_MSRS];
158 };
159
160 struct kvm_shared_msrs {
161         struct user_return_notifier urn;
162         bool registered;
163         struct kvm_shared_msr_values {
164                 u64 host;
165                 u64 curr;
166         } values[KVM_NR_SHARED_MSRS];
167 };
168
169 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
170 static struct kvm_shared_msrs __percpu *shared_msrs;
171
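/*
 * Per-VM ("VM_STAT") and per-vCPU ("VCPU_STAT") counters exported to
 * userspace via debugfs, typically as files under /sys/kernel/debug/kvm/.
 */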
172 struct kvm_stats_debugfs_item debugfs_entries[] = {
173         { "pf_fixed", VCPU_STAT(pf_fixed) },
174         { "pf_guest", VCPU_STAT(pf_guest) },
175         { "tlb_flush", VCPU_STAT(tlb_flush) },
176         { "invlpg", VCPU_STAT(invlpg) },
177         { "exits", VCPU_STAT(exits) },
178         { "io_exits", VCPU_STAT(io_exits) },
179         { "mmio_exits", VCPU_STAT(mmio_exits) },
180         { "signal_exits", VCPU_STAT(signal_exits) },
181         { "irq_window", VCPU_STAT(irq_window_exits) },
182         { "nmi_window", VCPU_STAT(nmi_window_exits) },
183         { "halt_exits", VCPU_STAT(halt_exits) },
184         { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
185         { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
186         { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
187         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
188         { "hypercalls", VCPU_STAT(hypercalls) },
189         { "request_irq", VCPU_STAT(request_irq_exits) },
190         { "irq_exits", VCPU_STAT(irq_exits) },
191         { "host_state_reload", VCPU_STAT(host_state_reload) },
192         { "fpu_reload", VCPU_STAT(fpu_reload) },
193         { "insn_emulation", VCPU_STAT(insn_emulation) },
194         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
195         { "irq_injections", VCPU_STAT(irq_injections) },
196         { "nmi_injections", VCPU_STAT(nmi_injections) },
197         { "req_event", VCPU_STAT(req_event) },
198         { "l1d_flush", VCPU_STAT(l1d_flush) },
199         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
200         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
201         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
202         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
203         { "mmu_flooded", VM_STAT(mmu_flooded) },
204         { "mmu_recycled", VM_STAT(mmu_recycled) },
205         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
206         { "mmu_unsync", VM_STAT(mmu_unsync) },
207         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
208         { "largepages", VM_STAT(lpages) },
209         { "max_mmu_page_hash_collisions",
210                 VM_STAT(max_mmu_page_hash_collisions) },
211         { NULL }
212 };
213
214 u64 __read_mostly host_xcr0;
215
216 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
217
218 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
219 {
220         int i;
221         for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
222                 vcpu->arch.apf.gfns[i] = ~0;
223 }
224
225 static void kvm_on_user_return(struct user_return_notifier *urn)
226 {
227         unsigned slot;
228         struct kvm_shared_msrs *locals
229                 = container_of(urn, struct kvm_shared_msrs, urn);
230         struct kvm_shared_msr_values *values;
231         unsigned long flags;
232
233         /*
234          * Disable irqs at this point, since the following code could be
235          * interrupted and executed through kvm_arch_hardware_disable().
236          */
237         local_irq_save(flags);
238         if (locals->registered) {
239                 locals->registered = false;
240                 user_return_notifier_unregister(urn);
241         }
242         local_irq_restore(flags);
243         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
244                 values = &locals->values[slot];
245                 if (values->host != values->curr) {
246                         wrmsrl(shared_msrs_global.msrs[slot], values->host);
247                         values->curr = values->host;
248                 }
249         }
250 }
251
252 static void shared_msr_update(unsigned slot, u32 msr)
253 {
254         u64 value;
255         unsigned int cpu = smp_processor_id();
256         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
257
258         /* Only reads here, and nobody should be modifying shared_msrs_global
259          * at this point, so no lock is needed. */
260         if (slot >= shared_msrs_global.nr) {
261                 printk(KERN_ERR "kvm: invalid MSR slot!");
262                 return;
263         }
264         rdmsrl_safe(msr, &value);
265         smsr->values[slot].host = value;
266         smsr->values[slot].curr = value;
267 }
268
269 void kvm_define_shared_msr(unsigned slot, u32 msr)
270 {
271         BUG_ON(slot >= KVM_NR_SHARED_MSRS);
272         shared_msrs_global.msrs[slot] = msr;
273         if (slot >= shared_msrs_global.nr)
274                 shared_msrs_global.nr = slot + 1;
275 }
276 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
277
278 static void kvm_shared_msr_cpu_online(void)
279 {
280         unsigned i;
281
282         for (i = 0; i < shared_msrs_global.nr; ++i)
283                 shared_msr_update(i, shared_msrs_global.msrs[i]);
284 }
285
286 int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
287 {
288         unsigned int cpu = smp_processor_id();
289         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
290         int err;
291
292         if (((value ^ smsr->values[slot].curr) & mask) == 0)
293                 return 0;
294         smsr->values[slot].curr = value;
295         err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
296         if (err)
297                 return 1;
298
299         if (!smsr->registered) {
300                 smsr->urn.on_user_return = kvm_on_user_return;
301                 user_return_notifier_register(&smsr->urn);
302                 smsr->registered = true;
303         }
304         return 0;
305 }
306 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
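/*
 * Illustrative sketch of the shared ("user return") MSR machinery (the
 * real callers live in vendor code, and the slot index here is arbitrary):
 *
 *	kvm_define_shared_msr(0, MSR_STAR);        once, at hardware setup
 *	kvm_set_shared_msr(0, guest_val, mask);    to install a guest value
 *
 * kvm_on_user_return() then lazily restores the host values the first
 * time the CPU returns to userspace.
 */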
307
308 static void drop_user_return_notifiers(void)
309 {
310         unsigned int cpu = smp_processor_id();
311         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
312
313         if (smsr->registered)
314                 kvm_on_user_return(&smsr->urn);
315 }
316
317 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
318 {
319         return vcpu->arch.apic_base;
320 }
321 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
322
323 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
324 {
325         return kvm_apic_mode(kvm_get_apic_base(vcpu));
326 }
327 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
328
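/*
 * The reserved bits of the APIC base MSR depend on guest CPUID (MAXPHYADDR
 * and x2APIC support).  Unless the write is host-initiated, the invalid
 * transitions x2APIC -> xAPIC and disabled -> x2APIC are rejected.
 */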
329 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
330 {
331         enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
332         enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
333         u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
334                 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
335
336         if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
337                 return 1;
338         if (!msr_info->host_initiated) {
339                 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
340                         return 1;
341                 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
342                         return 1;
343         }
344
345         kvm_lapic_set_base(vcpu, msr_info->data);
346         return 0;
347 }
348 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
349
350 asmlinkage __visible void kvm_spurious_fault(void)
351 {
352         /* Fault while not rebooting.  We want the trace. */
353         BUG();
354 }
355 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
356
357 #define EXCPT_BENIGN            0
358 #define EXCPT_CONTRIBUTORY      1
359 #define EXCPT_PF                2
360
361 static int exception_class(int vector)
362 {
363         switch (vector) {
364         case PF_VECTOR:
365                 return EXCPT_PF;
366         case DE_VECTOR:
367         case TS_VECTOR:
368         case NP_VECTOR:
369         case SS_VECTOR:
370         case GP_VECTOR:
371                 return EXCPT_CONTRIBUTORY;
372         default:
373                 break;
374         }
375         return EXCPT_BENIGN;
376 }
377
378 #define EXCPT_FAULT             0
379 #define EXCPT_TRAP              1
380 #define EXCPT_ABORT             2
381 #define EXCPT_INTERRUPT         3
382
383 static int exception_type(int vector)
384 {
385         unsigned int mask;
386
387         if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
388                 return EXCPT_INTERRUPT;
389
390         mask = 1 << vector;
391
392         /* #DB is a trap, as instruction watchpoints are handled elsewhere */
393         if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
394                 return EXCPT_TRAP;
395
396         if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
397                 return EXCPT_ABORT;
398
399         /* Reserved exceptions will result in fault */
400         return EXCPT_FAULT;
401 }
402
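/*
 * Exception merging rules implemented below, following the SDM's benign/
 * contributory classification: if either exception is benign, the new one
 * simply replaces the old; two contributory exceptions, or any non-benign
 * exception following a #PF, escalate to #DF; and a new exception while a
 * #DF is already queued escalates to a triple fault (VM shutdown request).
 */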
403 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
404                 unsigned nr, bool has_error, u32 error_code,
405                 bool reinject)
406 {
407         u32 prev_nr;
408         int class1, class2;
409
410         kvm_make_request(KVM_REQ_EVENT, vcpu);
411
412         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
413         queue:
414                 if (has_error && !is_protmode(vcpu))
415                         has_error = false;
416                 if (reinject) {
417                         /*
418                          * On vmentry, vcpu->arch.exception.pending is only
419                          * true if an event injection was blocked by
420                          * nested_run_pending.  In that case, however,
421                          * vcpu_enter_guest requests an immediate exit,
422                          * and the guest shouldn't proceed far enough to
423                          * need reinjection.
424                          */
425                         WARN_ON_ONCE(vcpu->arch.exception.pending);
426                         vcpu->arch.exception.injected = true;
427                 } else {
428                         vcpu->arch.exception.pending = true;
429                         vcpu->arch.exception.injected = false;
430                 }
431                 vcpu->arch.exception.has_error_code = has_error;
432                 vcpu->arch.exception.nr = nr;
433                 vcpu->arch.exception.error_code = error_code;
434                 return;
435         }
436
437         /* An exception is already pending or injected; see how to combine it with the new one. */
438         prev_nr = vcpu->arch.exception.nr;
439         if (prev_nr == DF_VECTOR) {
440                 /* triple fault -> shutdown */
441                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
442                 return;
443         }
444         class1 = exception_class(prev_nr);
445         class2 = exception_class(nr);
446         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
447                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
448                 /*
449                  * Generate double fault per SDM Table 5-5.  Set
450                  * exception.pending = true so that the double fault
451                  * can trigger a nested vmexit.
452                  */
453                 vcpu->arch.exception.pending = true;
454                 vcpu->arch.exception.injected = false;
455                 vcpu->arch.exception.has_error_code = true;
456                 vcpu->arch.exception.nr = DF_VECTOR;
457                 vcpu->arch.exception.error_code = 0;
458         } else
459                 /* replace the previous exception with the new one in the
460                    hope that instruction re-execution will regenerate the
461                    lost exception */
462                 goto queue;
463 }
464
465 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
466 {
467         kvm_multiple_exception(vcpu, nr, false, 0, false);
468 }
469 EXPORT_SYMBOL_GPL(kvm_queue_exception);
470
471 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
472 {
473         kvm_multiple_exception(vcpu, nr, false, 0, true);
474 }
475 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
476
477 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
478 {
479         if (err)
480                 kvm_inject_gp(vcpu, 0);
481         else
482                 return kvm_skip_emulated_instruction(vcpu);
483
484         return 1;
485 }
486 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
487
488 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
489 {
490         ++vcpu->stat.pf_guest;
491         vcpu->arch.exception.nested_apf =
492                 is_guest_mode(vcpu) && fault->async_page_fault;
493         if (vcpu->arch.exception.nested_apf)
494                 vcpu->arch.apf.nested_apf_token = fault->address;
495         else
496                 vcpu->arch.cr2 = fault->address;
497         kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
498 }
499 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
500
501 static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
502 {
503         if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
504                 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
505         else
506                 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
507
508         return fault->nested_page_fault;
509 }
510
511 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
512 {
513         atomic_inc(&vcpu->arch.nmi_queued);
514         kvm_make_request(KVM_REQ_NMI, vcpu);
515 }
516 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
517
518 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
519 {
520         kvm_multiple_exception(vcpu, nr, true, error_code, false);
521 }
522 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
523
524 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
525 {
526         kvm_multiple_exception(vcpu, nr, true, error_code, true);
527 }
528 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
529
530 /*
531  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
532  * a #GP and return false.
533  */
534 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
535 {
536         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
537                 return true;
538         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
539         return false;
540 }
541 EXPORT_SYMBOL_GPL(kvm_require_cpl);
542
543 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
544 {
545         if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
546                 return true;
547
548         kvm_queue_exception(vcpu, UD_VECTOR);
549         return false;
550 }
551 EXPORT_SYMBOL_GPL(kvm_require_dr);
552
553 /*
554  * This function is used to read from the physical memory of the currently
555  * running guest. The difference from kvm_vcpu_read_guest_page is that this
556  * function can read either guest physical memory or the physical memory of a nested guest.
557  */
558 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
559                             gfn_t ngfn, void *data, int offset, int len,
560                             u32 access)
561 {
562         struct x86_exception exception;
563         gfn_t real_gfn;
564         gpa_t ngpa;
565
566         ngpa     = gfn_to_gpa(ngfn);
567         real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
568         if (real_gfn == UNMAPPED_GVA)
569                 return -EFAULT;
570
571         real_gfn = gpa_to_gfn(real_gfn);
572
573         return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
574 }
575 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
576
577 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
578                                void *data, int offset, int len, u32 access)
579 {
580         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
581                                        data, offset, len, access);
582 }
583
584 /*
585  * Load the PAE PDPTRs.  Return true if they are all valid.
586  */
587 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
588 {
589         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
590         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
591         int i;
592         int ret;
593         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
594
595         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
596                                       offset * sizeof(u64), sizeof(pdpte),
597                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
598         if (ret < 0) {
599                 ret = 0;
600                 goto out;
601         }
602         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
603                 if ((pdpte[i] & PT_PRESENT_MASK) &&
604                     (pdpte[i] &
605                      vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
606                         ret = 0;
607                         goto out;
608                 }
609         }
610         ret = 1;
611
612         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
613         __set_bit(VCPU_EXREG_PDPTR,
614                   (unsigned long *)&vcpu->arch.regs_avail);
615         __set_bit(VCPU_EXREG_PDPTR,
616                   (unsigned long *)&vcpu->arch.regs_dirty);
617 out:
618
619         return ret;
620 }
621 EXPORT_SYMBOL_GPL(load_pdptrs);
622
623 bool pdptrs_changed(struct kvm_vcpu *vcpu)
624 {
625         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
626         bool changed = true;
627         int offset;
628         gfn_t gfn;
629         int r;
630
631         if (is_long_mode(vcpu) || !is_pae(vcpu))
632                 return false;
633
634         if (!test_bit(VCPU_EXREG_PDPTR,
635                       (unsigned long *)&vcpu->arch.regs_avail))
636                 return true;
637
638         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
639         offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
640         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
641                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
642         if (r < 0)
643                 goto out;
644         changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
645 out:
646
647         return changed;
648 }
649 EXPORT_SYMBOL_GPL(pdptrs_changed);
650
651 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
652 {
653         unsigned long old_cr0 = kvm_read_cr0(vcpu);
654         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
655
656         cr0 |= X86_CR0_ET;
657
658 #ifdef CONFIG_X86_64
659         if (cr0 & 0xffffffff00000000UL)
660                 return 1;
661 #endif
662
663         cr0 &= ~CR0_RESERVED_BITS;
664
665         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
666                 return 1;
667
668         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
669                 return 1;
670
671         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
672 #ifdef CONFIG_X86_64
673                 if ((vcpu->arch.efer & EFER_LME)) {
674                         int cs_db, cs_l;
675
676                         if (!is_pae(vcpu))
677                                 return 1;
678                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
679                         if (cs_l)
680                                 return 1;
681                 } else
682 #endif
683                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
684                                                  kvm_read_cr3(vcpu)))
685                         return 1;
686         }
687
688         if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
689                 return 1;
690
691         kvm_x86_ops->set_cr0(vcpu, cr0);
692
693         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
694                 kvm_clear_async_pf_completion_queue(vcpu);
695                 kvm_async_pf_hash_reset(vcpu);
696         }
697
698         if ((cr0 ^ old_cr0) & update_bits)
699                 kvm_mmu_reset_context(vcpu);
700
701         if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
702             kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
703             !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
704                 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
705
706         return 0;
707 }
708 EXPORT_SYMBOL_GPL(kvm_set_cr0);
709
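/*
 * LMSW loads only the low four CR0 bits: it can set (but never clear) PE
 * and can freely update MP, EM and TS.  The mask/merge below preserves
 * all other CR0 bits.
 */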
710 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
711 {
712         (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
713 }
714 EXPORT_SYMBOL_GPL(kvm_lmsw);
715
716 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
717 {
718         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
719                         !vcpu->guest_xcr0_loaded) {
720                 /* kvm_set_xcr() also depends on this */
721                 if (vcpu->arch.xcr0 != host_xcr0)
722                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
723                 vcpu->guest_xcr0_loaded = 1;
724         }
725 }
726
727 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
728 {
729         if (vcpu->guest_xcr0_loaded) {
730                 if (vcpu->arch.xcr0 != host_xcr0)
731                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
732                 vcpu->guest_xcr0_loaded = 0;
733         }
734 }
735
736 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
737 {
738         u64 xcr0 = xcr;
739         u64 old_xcr0 = vcpu->arch.xcr0;
740         u64 valid_bits;
741
742         /* Only XCR_XFEATURE_ENABLED_MASK (xcr0) is supported for now */
743         if (index != XCR_XFEATURE_ENABLED_MASK)
744                 return 1;
745         if (!(xcr0 & XFEATURE_MASK_FP))
746                 return 1;
747         if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
748                 return 1;
749
750         /*
751          * Do not allow the guest to set bits that we do not support
752          * saving.  However, xcr0 bit 0 is always set, even if the
753          * emulated CPU does not support XSAVE (see fx_init).
754          */
755         valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
756         if (xcr0 & ~valid_bits)
757                 return 1;
758
759         if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
760             (!(xcr0 & XFEATURE_MASK_BNDCSR)))
761                 return 1;
762
763         if (xcr0 & XFEATURE_MASK_AVX512) {
764                 if (!(xcr0 & XFEATURE_MASK_YMM))
765                         return 1;
766                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
767                         return 1;
768         }
769         vcpu->arch.xcr0 = xcr0;
770
771         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
772                 kvm_update_cpuid(vcpu);
773         return 0;
774 }
775
776 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
777 {
778         if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
779             __kvm_set_xcr(vcpu, index, xcr)) {
780                 kvm_inject_gp(vcpu, 0);
781                 return 1;
782         }
783         return 0;
784 }
785 EXPORT_SYMBOL_GPL(kvm_set_xcr);
786
787 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
788 {
789         unsigned long old_cr4 = kvm_read_cr4(vcpu);
790         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
791                                    X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
792
793         if (cr4 & CR4_RESERVED_BITS)
794                 return 1;
795
796         if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
797                 return 1;
798
799         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
800                 return 1;
801
802         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
803                 return 1;
804
805         if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
806                 return 1;
807
808         if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
809                 return 1;
810
811         if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
812                 return 1;
813
814         if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
815                 return 1;
816
817         if (is_long_mode(vcpu)) {
818                 if (!(cr4 & X86_CR4_PAE))
819                         return 1;
820         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
821                    && ((cr4 ^ old_cr4) & pdptr_bits)
822                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
823                                    kvm_read_cr3(vcpu)))
824                 return 1;
825
826         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
827                 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
828                         return 1;
829
830                 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
831                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
832                         return 1;
833         }
834
835         if (kvm_x86_ops->set_cr4(vcpu, cr4))
836                 return 1;
837
838         if (((cr4 ^ old_cr4) & pdptr_bits) ||
839             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
840                 kvm_mmu_reset_context(vcpu);
841
842         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
843                 kvm_update_cpuid(vcpu);
844
845         return 0;
846 }
847 EXPORT_SYMBOL_GPL(kvm_set_cr4);
848
849 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
850 {
851         bool skip_tlb_flush = false;
852 #ifdef CONFIG_X86_64
853         bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
854
855         if (pcid_enabled) {
856                 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
857                 cr3 &= ~X86_CR3_PCID_NOFLUSH;
858         }
859 #endif
860
861         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
862                 if (!skip_tlb_flush) {
863                         kvm_mmu_sync_roots(vcpu);
864                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
865                 }
866                 return 0;
867         }
868
869         if (is_long_mode(vcpu) &&
870             (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
871                 return 1;
872         else if (is_pae(vcpu) && is_paging(vcpu) &&
873                    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
874                 return 1;
875
876         kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
877         vcpu->arch.cr3 = cr3;
878         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
879
880         return 0;
881 }
882 EXPORT_SYMBOL_GPL(kvm_set_cr3);
883
884 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
885 {
886         if (cr8 & CR8_RESERVED_BITS)
887                 return 1;
888         if (lapic_in_kernel(vcpu))
889                 kvm_lapic_set_tpr(vcpu, cr8);
890         else
891                 vcpu->arch.cr8 = cr8;
892         return 0;
893 }
894 EXPORT_SYMBOL_GPL(kvm_set_cr8);
895
896 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
897 {
898         if (lapic_in_kernel(vcpu))
899                 return kvm_lapic_get_cr8(vcpu);
900         else
901                 return vcpu->arch.cr8;
902 }
903 EXPORT_SYMBOL_GPL(kvm_get_cr8);
904
905 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
906 {
907         int i;
908
909         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
910                 for (i = 0; i < KVM_NR_DB_REGS; i++)
911                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
912                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
913         }
914 }
915
916 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
917 {
918         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
919                 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
920 }
921
922 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
923 {
924         unsigned long dr7;
925
926         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
927                 dr7 = vcpu->arch.guest_debug_dr7;
928         else
929                 dr7 = vcpu->arch.dr7;
930         kvm_x86_ops->set_dr7(vcpu, dr7);
931         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
932         if (dr7 & DR7_BP_EN_MASK)
933                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
934 }
935
936 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
937 {
938         u64 fixed = DR6_FIXED_1;
939
940         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
941                 fixed |= DR6_RTM;
942         return fixed;
943 }
944
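/*
 * DR4 and DR5 are treated as legacy aliases of DR6 and DR7 here; callers
 * are expected to have used kvm_require_dr() first, which raises #UD
 * instead when CR4.DE is set.
 */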
945 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
946 {
947         switch (dr) {
948         case 0 ... 3:
949                 vcpu->arch.db[dr] = val;
950                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
951                         vcpu->arch.eff_db[dr] = val;
952                 break;
953         case 4:
954                 /* fall through */
955         case 6:
956                 if (val & 0xffffffff00000000ULL)
957                         return -1; /* #GP */
958                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
959                 kvm_update_dr6(vcpu);
960                 break;
961         case 5:
962                 /* fall through */
963         default: /* 7 */
964                 if (val & 0xffffffff00000000ULL)
965                         return -1; /* #GP */
966                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
967                 kvm_update_dr7(vcpu);
968                 break;
969         }
970
971         return 0;
972 }
973
974 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
975 {
976         if (__kvm_set_dr(vcpu, dr, val)) {
977                 kvm_inject_gp(vcpu, 0);
978                 return 1;
979         }
980         return 0;
981 }
982 EXPORT_SYMBOL_GPL(kvm_set_dr);
983
984 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
985 {
986         switch (dr) {
987         case 0 ... 3:
988                 *val = vcpu->arch.db[dr];
989                 break;
990         case 4:
991                 /* fall through */
992         case 6:
993                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
994                         *val = vcpu->arch.dr6;
995                 else
996                         *val = kvm_x86_ops->get_dr6(vcpu);
997                 break;
998         case 5:
999                 /* fall through */
1000         default: /* 7 */
1001                 *val = vcpu->arch.dr7;
1002                 break;
1003         }
1004         return 0;
1005 }
1006 EXPORT_SYMBOL_GPL(kvm_get_dr);
1007
1008 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1009 {
1010         u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1011         u64 data;
1012         int err;
1013
1014         err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1015         if (err)
1016                 return err;
1017         kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1018         kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1019         return err;
1020 }
1021 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1022
1023 /*
1024  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1025  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1026  *
1027  * This list is modified at module load time to reflect the
1028  * capabilities of the host cpu. This capabilities test skips MSRs that are
1029  * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1030  * may depend on host virtualization features rather than host cpu features.
1031  */
1032
1033 static u32 msrs_to_save[] = {
1034         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1035         MSR_STAR,
1036 #ifdef CONFIG_X86_64
1037         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1038 #endif
1039         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1040         MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1041         MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1042 };
1043
1044 static unsigned num_msrs_to_save;
1045
1046 static u32 emulated_msrs[] = {
1047         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1048         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1049         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1050         HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1051         HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1052         HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1053         HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1054         HV_X64_MSR_RESET,
1055         HV_X64_MSR_VP_INDEX,
1056         HV_X64_MSR_VP_RUNTIME,
1057         HV_X64_MSR_SCONTROL,
1058         HV_X64_MSR_STIMER0_CONFIG,
1059         HV_X64_MSR_VP_ASSIST_PAGE,
1060         HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1061         HV_X64_MSR_TSC_EMULATION_STATUS,
1062
1063         MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1064         MSR_KVM_PV_EOI_EN,
1065
1066         MSR_IA32_TSC_ADJUST,
1067         MSR_IA32_TSCDEADLINE,
1068         MSR_IA32_MISC_ENABLE,
1069         MSR_IA32_MCG_STATUS,
1070         MSR_IA32_MCG_CTL,
1071         MSR_IA32_MCG_EXT_CTL,
1072         MSR_IA32_SMBASE,
1073         MSR_SMI_COUNT,
1074         MSR_PLATFORM_INFO,
1075         MSR_MISC_FEATURES_ENABLES,
1076         MSR_AMD64_VIRT_SPEC_CTRL,
1077 };
1078
1079 static unsigned num_emulated_msrs;
1080
1081 /*
1082  * List of msr numbers which are used to expose MSR-based features that
1083  * can be used by a hypervisor to validate requested CPU features.
1084  */
1085 static u32 msr_based_features[] = {
1086         MSR_IA32_VMX_BASIC,
1087         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1088         MSR_IA32_VMX_PINBASED_CTLS,
1089         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1090         MSR_IA32_VMX_PROCBASED_CTLS,
1091         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1092         MSR_IA32_VMX_EXIT_CTLS,
1093         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1094         MSR_IA32_VMX_ENTRY_CTLS,
1095         MSR_IA32_VMX_MISC,
1096         MSR_IA32_VMX_CR0_FIXED0,
1097         MSR_IA32_VMX_CR0_FIXED1,
1098         MSR_IA32_VMX_CR4_FIXED0,
1099         MSR_IA32_VMX_CR4_FIXED1,
1100         MSR_IA32_VMX_VMCS_ENUM,
1101         MSR_IA32_VMX_PROCBASED_CTLS2,
1102         MSR_IA32_VMX_EPT_VPID_CAP,
1103         MSR_IA32_VMX_VMFUNC,
1104
1105         MSR_F10H_DECFG,
1106         MSR_IA32_UCODE_REV,
1107         MSR_IA32_ARCH_CAPABILITIES,
1108 };
1109
1110 static unsigned int num_msr_based_features;
1111
1112 u64 kvm_get_arch_capabilities(void)
1113 {
1114         u64 data;
1115
1116         rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1117
1118         /*
1119          * If we're doing cache flushes (either "always" or "cond")
1120          * we will do one whenever the guest does a vmlaunch/vmresume.
1121          * If an outer hypervisor is doing the cache flush for us
1122          * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1123          * capability to the guest too, and if EPT is disabled we're not
1124          * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1125          * require a nested hypervisor to do a flush of its own.
1126          */
1127         if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1128                 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1129
1130         return data;
1131 }
1132 EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
1133
1134 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1135 {
1136         switch (msr->index) {
1137         case MSR_IA32_ARCH_CAPABILITIES:
1138                 msr->data = kvm_get_arch_capabilities();
1139                 break;
1140         case MSR_IA32_UCODE_REV:
1141                 rdmsrl_safe(msr->index, &msr->data);
1142                 break;
1143         default:
1144                 if (kvm_x86_ops->get_msr_feature(msr))
1145                         return 1;
1146         }
1147         return 0;
1148 }
1149
1150 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1151 {
1152         struct kvm_msr_entry msr;
1153         int r;
1154
1155         msr.index = index;
1156         r = kvm_get_msr_feature(&msr);
1157         if (r)
1158                 return r;
1159
1160         *data = msr.data;
1161
1162         return 0;
1163 }
1164
1165 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1166 {
1167         if (efer & efer_reserved_bits)
1168                 return false;
1169
1170         if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1171                         return false;
1172
1173         if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1174                         return false;
1175
1176         return true;
1177 }
1178 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1179
1180 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
1181 {
1182         u64 old_efer = vcpu->arch.efer;
1183
1184         if (!kvm_valid_efer(vcpu, efer))
1185                 return 1;
1186
1187         if (is_paging(vcpu)
1188             && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1189                 return 1;
1190
1191         efer &= ~EFER_LMA;
1192         efer |= vcpu->arch.efer & EFER_LMA;
1193
1194         kvm_x86_ops->set_efer(vcpu, efer);
1195
1196         /* Update reserved bits */
1197         if ((efer ^ old_efer) & EFER_NX)
1198                 kvm_mmu_reset_context(vcpu);
1199
1200         return 0;
1201 }
1202
1203 void kvm_enable_efer_bits(u64 mask)
1204 {
1205        efer_reserved_bits &= ~mask;
1206 }
1207 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1208
1209 /*
1210  * Writes the MSR value into the appropriate "register".
1211  * Returns 0 on success, non-0 otherwise.
1212  * Assumes vcpu_load() was already called.
1213  */
1214 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1215 {
1216         switch (msr->index) {
1217         case MSR_FS_BASE:
1218         case MSR_GS_BASE:
1219         case MSR_KERNEL_GS_BASE:
1220         case MSR_CSTAR:
1221         case MSR_LSTAR:
1222                 if (is_noncanonical_address(msr->data, vcpu))
1223                         return 1;
1224                 break;
1225         case MSR_IA32_SYSENTER_EIP:
1226         case MSR_IA32_SYSENTER_ESP:
1227                 /*
1228                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1229                  * non-canonical address is written on Intel but not on
1230                  * AMD (which ignores the top 32-bits, because it does
1231                  * not implement 64-bit SYSENTER).
1232                  *
1233                  * 64-bit code should hence be able to write a non-canonical
1234                  * value on AMD.  Making the address canonical ensures that
1235                  * vmentry does not fail on Intel after writing a non-canonical
1236                  * value, and that something deterministic happens if the guest
1237                  * invokes 64-bit SYSENTER.
1238                  */
1239                 msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1240         }
1241         return kvm_x86_ops->set_msr(vcpu, msr);
1242 }
1243 EXPORT_SYMBOL_GPL(kvm_set_msr);
1244
1245 /*
1246  * Adapt kvm_get_msr() and kvm_set_msr() to msr_io()'s calling convention
1247  */
1248 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1249 {
1250         struct msr_data msr;
1251         int r;
1252
1253         msr.index = index;
1254         msr.host_initiated = true;
1255         r = kvm_get_msr(vcpu, &msr);
1256         if (r)
1257                 return r;
1258
1259         *data = msr.data;
1260         return 0;
1261 }
1262
1263 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1264 {
1265         struct msr_data msr;
1266
1267         msr.data = *data;
1268         msr.index = index;
1269         msr.host_initiated = true;
1270         return kvm_set_msr(vcpu, &msr);
1271 }
1272
1273 #ifdef CONFIG_X86_64
1274 struct pvclock_gtod_data {
1275         seqcount_t      seq;
1276
1277         struct { /* extract of a clocksource struct */
1278                 int vclock_mode;
1279                 u64     cycle_last;
1280                 u64     mask;
1281                 u32     mult;
1282                 u32     shift;
1283         } clock;
1284
1285         u64             boot_ns;
1286         u64             nsec_base;
1287         u64             wall_time_sec;
1288 };
1289
1290 static struct pvclock_gtod_data pvclock_gtod_data;
1291
1292 static void update_pvclock_gtod(struct timekeeper *tk)
1293 {
1294         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1295         u64 boot_ns;
1296
1297         boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
1298
1299         write_seqcount_begin(&vdata->seq);
1300
1301         /* copy pvclock gtod data */
1302         vdata->clock.vclock_mode        = tk->tkr_mono.clock->archdata.vclock_mode;
1303         vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1304         vdata->clock.mask               = tk->tkr_mono.mask;
1305         vdata->clock.mult               = tk->tkr_mono.mult;
1306         vdata->clock.shift              = tk->tkr_mono.shift;
1307
1308         vdata->boot_ns                  = boot_ns;
1309         vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
1310
1311         vdata->wall_time_sec            = tk->xtime_sec;
1312
1313         write_seqcount_end(&vdata->seq);
1314 }
1315 #endif
1316
1317 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1318 {
1319         /*
1320          * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1321          * vcpu_enter_guest.  This function is only called from
1322          * the physical CPU that is running vcpu.
1323          */
1324         kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1325 }
1326
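/*
 * Write the pvclock wall-clock structure into guest memory.  The version
 * field follows a seqlock-like protocol: it is bumped to an odd value
 * before the data is updated and to an even value afterwards, so a guest
 * that sees an odd version knows an update is in progress and retries.
 */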
1327 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1328 {
1329         int version;
1330         int r;
1331         struct pvclock_wall_clock wc;
1332         struct timespec64 boot;
1333
1334         if (!wall_clock)
1335                 return;
1336
1337         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1338         if (r)
1339                 return;
1340
1341         if (version & 1)
1342                 ++version;  /* first time write, random junk */
1343
1344         ++version;
1345
1346         if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1347                 return;
1348
1349         /*
1350          * The guest calculates current wall clock time by adding
1351          * system time (updated by kvm_guest_time_update below) to the
1352          * wall clock specified here.  guest system time equals host
1353          * system time for us, thus we must fill in host boot time here.
1354          */
1355         getboottime64(&boot);
1356
1357         if (kvm->arch.kvmclock_offset) {
1358                 struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1359                 boot = timespec64_sub(boot, ts);
1360         }
1361         wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1362         wc.nsec = boot.tv_nsec;
1363         wc.version = version;
1364
1365         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1366
1367         version++;
1368         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1369 }
1370
1371 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1372 {
1373         do_shl32_div32(dividend, divisor);
1374         return dividend;
1375 }
1376
1377 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1378                                s8 *pshift, u32 *pmultiplier)
1379 {
1380         uint64_t scaled64;
1381         int32_t  shift = 0;
1382         uint64_t tps64;
1383         uint32_t tps32;
1384
1385         tps64 = base_hz;
1386         scaled64 = scaled_hz;
1387         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1388                 tps64 >>= 1;
1389                 shift--;
1390         }
1391
1392         tps32 = (uint32_t)tps64;
1393         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1394                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1395                         scaled64 >>= 1;
1396                 else
1397                         tps32 <<= 1;
1398                 shift++;
1399         }
1400
1401         *pshift = shift;
1402         *pmultiplier = div_frac(scaled64, tps32);
1403
1404         pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1405                  __func__, base_hz, scaled_hz, shift, *pmultiplier);
1406 }
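/*
 * Worked example (illustrative): kvm_get_time_scale(2500000000, NSEC_PER_SEC)
 * yields shift = 2 and multiplier = 0xa0000000, so pvclock_scale_delta()
 * computes ((delta << 2) * 0xa0000000) >> 32 == delta * 2.5, converting
 * nanoseconds into cycles of a 2.5 GHz clock.
 */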
1407
1408 #ifdef CONFIG_X86_64
1409 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1410 #endif
1411
1412 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1413 static unsigned long max_tsc_khz;
1414
1415 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1416 {
1417         u64 v = (u64)khz * (1000000 + ppm);
1418         do_div(v, 1000000);
1419         return v;
1420 }
1421
1422 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1423 {
1424         u64 ratio;
1425
1426         /* Guest TSC same frequency as host TSC? */
1427         if (!scale) {
1428                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1429                 return 0;
1430         }
1431
1432         /* TSC scaling supported? */
1433         if (!kvm_has_tsc_control) {
1434                 if (user_tsc_khz > tsc_khz) {
1435                         vcpu->arch.tsc_catchup = 1;
1436                         vcpu->arch.tsc_always_catchup = 1;
1437                         return 0;
1438                 } else {
1439                         WARN(1, "user requested TSC rate below hardware speed\n");
1440                         return -1;
1441                 }
1442         }
1443
1444         /* TSC scaling required  - calculate ratio */
1445         ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1446                                 user_tsc_khz, tsc_khz);
1447
1448         if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
1449                 WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1450                           user_tsc_khz);
1451                 return -1;
1452         }
1453
1454         vcpu->arch.tsc_scaling_ratio = ratio;
1455         return 0;
1456 }
1457
1458 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1459 {
1460         u32 thresh_lo, thresh_hi;
1461         int use_scaling = 0;
1462
1463         /* tsc_khz can be zero if TSC calibration fails */
1464         if (user_tsc_khz == 0) {
1465                 /* set tsc_scaling_ratio to a safe value */
1466                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1467                 return -1;
1468         }
1469
1470         /* Compute a scale to convert nanoseconds in TSC cycles */
1471         kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
1472                            &vcpu->arch.virtual_tsc_shift,
1473                            &vcpu->arch.virtual_tsc_mult);
1474         vcpu->arch.virtual_tsc_khz = user_tsc_khz;
1475
1476         /*
1477          * Compute the acceptable variation in TSC rate within the
1478          * tolerance range and decide whether the requested rate falls
1479          * within those bounds of the hardware rate.  If so, no scaling
1480          * or compensation needs to be done.
1481          */
1482         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1483         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1484         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
1485                 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
1486                 use_scaling = 1;
1487         }
1488         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
1489 }
1490
1491 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1492 {
1493         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1494                                       vcpu->arch.virtual_tsc_mult,
1495                                       vcpu->arch.virtual_tsc_shift);
1496         tsc += vcpu->arch.this_tsc_write;
1497         return tsc;
1498 }
1499
1500 static inline int gtod_is_based_on_tsc(int mode)
1501 {
1502         return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
1503 }
1504
1505 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1506 {
1507 #ifdef CONFIG_X86_64
1508         bool vcpus_matched;
1509         struct kvm_arch *ka = &vcpu->kvm->arch;
1510         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1511
1512         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1513                          atomic_read(&vcpu->kvm->online_vcpus));
1514
1515         /*
1516          * Once the masterclock is enabled, always perform request in
1517          * order to update it.
1518          *
1519          * In order to enable masterclock, the host clocksource must be TSC
1520          * and the vcpus need to have matched TSCs.  When that happens,
1521          * perform request to enable masterclock.
1522          */
1523         if (ka->use_master_clock ||
1524             (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
1525                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1526
1527         trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1528                             atomic_read(&vcpu->kvm->online_vcpus),
1529                             ka->use_master_clock, gtod->clock.vclock_mode);
1530 #endif
1531 }
1532
1533 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1534 {
1535         u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1536         vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1537 }
1538
1539 /*
1540  * Multiply tsc by a fixed point number represented by ratio.
1541  *
1542  * The most significant 64-N bits (mult) of ratio represent the
1543  * integral part of the fixed point number; the remaining N bits
1544  * (frac) represent the fractional part, i.e. ratio represents a fixed
1545  * point number (mult + frac * 2^(-N)).
1546  *
1547  * N equals kvm_tsc_scaling_ratio_frac_bits.
1548  */
1549 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1550 {
1551         return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
1552 }
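/*
 * For example, with kvm_tsc_scaling_ratio_frac_bits == 48 (e.g. on VMX),
 * a ratio of 3ULL << 47 encodes 1.5, so __scale_tsc(3ULL << 47, 1000000)
 * returns 1500000.
 */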
1553
1554 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1555 {
1556         u64 _tsc = tsc;
1557         u64 ratio = vcpu->arch.tsc_scaling_ratio;
1558
1559         if (ratio != kvm_default_tsc_scaling_ratio)
1560                 _tsc = __scale_tsc(ratio, tsc);
1561
1562         return _tsc;
1563 }
1564 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
1565
1566 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1567 {
1568         u64 tsc;
1569
1570         tsc = kvm_scale_tsc(vcpu, rdtsc());
1571
1572         return target_tsc - tsc;
1573 }
1574
1575 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1576 {
1577         u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1578
1579         return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
1580 }
1581 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1582
1583 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1584 {
1585         kvm_x86_ops->write_tsc_offset(vcpu, offset);
1586         vcpu->arch.tsc_offset = offset;
1587 }
1588
1589 static inline bool kvm_check_tsc_unstable(void)
1590 {
1591 #ifdef CONFIG_X86_64
1592         /*
1593          * The TSC is marked unstable when we're running on Hyper-V, but
1594          * the 'TSC page' clocksource is still good, so treat it as stable.
1595          */
1596         if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
1597                 return false;
1598 #endif
1599         return check_tsc_unstable();
1600 }
1601
1602 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1603 {
1604         struct kvm *kvm = vcpu->kvm;
1605         u64 offset, ns, elapsed;
1606         unsigned long flags;
1607         bool matched;
1608         bool already_matched;
1609         u64 data = msr->data;
1610         bool synchronizing = false;
1611
1612         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1613         offset = kvm_compute_tsc_offset(vcpu, data);
1614         ns = ktime_get_boot_ns();
1615         elapsed = ns - kvm->arch.last_tsc_nsec;
1616
1617         if (vcpu->arch.virtual_tsc_khz) {
1618                 if (data == 0 && msr->host_initiated) {
1619                         /*
1620                          * detection of vcpu initialization -- need to sync
1621                          * with other vCPUs. This particularly helps to keep
1622                          * kvm_clock stable after CPU hotplug
1623                          */
1624                         synchronizing = true;
1625                 } else {
1626                         u64 tsc_exp = kvm->arch.last_tsc_write +
1627                                                 nsec_to_cycles(vcpu, elapsed);
1628                         u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
1629                         /*
1630                          * Special case: a TSC write whose delta from the
1631                          * expected value is within one second of virtual cycle
1632                          * time is interpreted as an attempt to synchronize the CPU.
1633                          */
1634                         synchronizing = data < tsc_exp + tsc_hz &&
1635                                         data + tsc_hz > tsc_exp;
1636                 }
1637         }
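        /*
         * Illustrative example (added for clarity, not in the original
         * source): with virtual_tsc_khz = 2,000,000 (a 2 GHz guest TSC),
         * tsc_hz above is 2,000,000,000 cycles.  If 5 ms of real time
         * elapsed since the last write, then
         *
         *      tsc_exp = last_tsc_write + 10,000,000 cycles
         *
         * and any write landing within +/- 2,000,000,000 cycles (one second
         * of guest cycle time) of tsc_exp is treated as an attempt to
         * synchronize rather than as an arbitrary jump.
         */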
1638
1639         /*
1640          * For a reliable TSC, we can match TSC offsets, and for an unstable
1641          * TSC, we add elapsed time in this computation.  We could let the
1642          * compensation code attempt to catch up if we fall behind, but
1643          * it's better to try to match offsets from the beginning.
1644          */
1645         if (synchronizing &&
1646             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1647                 if (!kvm_check_tsc_unstable()) {
1648                         offset = kvm->arch.cur_tsc_offset;
1649                         pr_debug("kvm: matched tsc offset for %llu\n", data);
1650                 } else {
1651                         u64 delta = nsec_to_cycles(vcpu, elapsed);
1652                         data += delta;
1653                         offset = kvm_compute_tsc_offset(vcpu, data);
1654                         pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1655                 }
1656                 matched = true;
1657                 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1658         } else {
1659                 /*
1660                  * We split periods of matched TSC writes into generations.
1661                  * For each generation, we track the original measured
1662                  * nanosecond time, offset, and write, so if TSCs are in
1663                  * sync, we can match exact offset, and if not, we can match
1664          * exact software computation in compute_guest_tsc().
1665                  *
1666                  * These values are tracked in kvm->arch.cur_xxx variables.
1667                  */
1668                 kvm->arch.cur_tsc_generation++;
1669                 kvm->arch.cur_tsc_nsec = ns;
1670                 kvm->arch.cur_tsc_write = data;
1671                 kvm->arch.cur_tsc_offset = offset;
1672                 matched = false;
1673                 pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1674                          kvm->arch.cur_tsc_generation, data);
1675         }
1676
1677         /*
1678          * We also track the most recent recorded kHz, write and time to
1679          * allow the matching interval to be extended at each write.
1680          */
1681         kvm->arch.last_tsc_nsec = ns;
1682         kvm->arch.last_tsc_write = data;
1683         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1684
1685         vcpu->arch.last_guest_tsc = data;
1686
1687         /* Keep track of which generation this VCPU has synchronized to */
1688         vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1689         vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1690         vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1691
1692         if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1693                 update_ia32_tsc_adjust_msr(vcpu, offset);
1694
1695         kvm_vcpu_write_tsc_offset(vcpu, offset);
1696         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1697
1698         spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1699         if (!matched) {
1700                 kvm->arch.nr_vcpus_matched_tsc = 0;
1701         } else if (!already_matched) {
1702                 kvm->arch.nr_vcpus_matched_tsc++;
1703         }
1704
1705         kvm_track_tsc_matching(vcpu);
1706         spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1707 }
1709 EXPORT_SYMBOL_GPL(kvm_write_tsc);
1710
1711 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
1712                                            s64 adjustment)
1713 {
1714         kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
1715 }
1716
1717 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
1718 {
1719         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
1720                 WARN_ON(adjustment < 0);
1721         adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
1722         adjust_tsc_offset_guest(vcpu, adjustment);
1723 }
1724
1725 #ifdef CONFIG_X86_64
1726
1727 static u64 read_tsc(void)
1728 {
1729         u64 ret = (u64)rdtsc_ordered();
1730         u64 last = pvclock_gtod_data.clock.cycle_last;
1731
1732         if (likely(ret >= last))
1733                 return ret;
1734
1735         /*
1736          * GCC likes to generate cmov here, but this branch is extremely
1737          * predictable (it's just a function of time and the likely is
1738          * very likely) and there's a data dependence, so force GCC
1739          * to generate a branch instead.  I don't barrier() because
1740          * we don't actually need a barrier, and if this function
1741          * ever gets inlined it will generate worse code.
1742          */
1743         asm volatile ("");
1744         return last;
1745 }
1746
1747 static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
1748 {
1749         long v;
1750         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1751         u64 tsc_pg_val;
1752
1753         switch (gtod->clock.vclock_mode) {
1754         case VCLOCK_HVCLOCK:
1755                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
1756                                                   tsc_timestamp);
1757                 if (tsc_pg_val != U64_MAX) {
1758                         /* TSC page valid */
1759                         *mode = VCLOCK_HVCLOCK;
1760                         v = (tsc_pg_val - gtod->clock.cycle_last) &
1761                                 gtod->clock.mask;
1762                 } else {
1763                         /* TSC page invalid */
1764                         *mode = VCLOCK_NONE;
1765                 }
1766                 break;
1767         case VCLOCK_TSC:
1768                 *mode = VCLOCK_TSC;
1769                 *tsc_timestamp = read_tsc();
1770                 v = (*tsc_timestamp - gtod->clock.cycle_last) &
1771                         gtod->clock.mask;
1772                 break;
1773         default:
1774                 *mode = VCLOCK_NONE;
1775         }
1776
1777         if (*mode == VCLOCK_NONE)
1778                 *tsc_timestamp = v = 0;
1779
1780         return v * gtod->clock.mult;
1781 }
1782
1783 static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
1784 {
1785         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1786         unsigned long seq;
1787         int mode;
1788         u64 ns;
1789
1790         do {
1791                 seq = read_seqcount_begin(&gtod->seq);
1792                 ns = gtod->nsec_base;
1793                 ns += vgettsc(tsc_timestamp, &mode);
1794                 ns >>= gtod->clock.shift;
1795                 ns += gtod->boot_ns;
1796         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1797         *t = ns;
1798
1799         return mode;
1800 }
1801
1802 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
1803 {
1804         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1805         unsigned long seq;
1806         int mode;
1807         u64 ns;
1808
1809         do {
1810                 seq = read_seqcount_begin(&gtod->seq);
1811                 ts->tv_sec = gtod->wall_time_sec;
1812                 ns = gtod->nsec_base;
1813                 ns += vgettsc(tsc_timestamp, &mode);
1814                 ns >>= gtod->clock.shift;
1815         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1816
1817         ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
1818         ts->tv_nsec = ns;
1819
1820         return mode;
1821 }
1822
1823 /* returns true if host is using TSC based clocksource */
1824 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
1825 {
1826         /* checked again under seqlock below */
1827         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1828                 return false;
1829
1830         return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
1831                                                       tsc_timestamp));
1832 }
1833
1834 /* returns true if host is using TSC based clocksource */
1835 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
1836                                            u64 *tsc_timestamp)
1837 {
1838         /* checked again under seqlock below */
1839         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1840                 return false;
1841
1842         return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
1843 }
1844 #endif
1845
1846 /*
1847  *
1848  * Assuming a stable TSC across physical CPUs, and a stable TSC
1849  * across virtual CPUs, the following condition is possible.
1850  * Each numbered line represents an event visible to both
1851  * CPUs at the next numbered event.
1852  *
1853  * "timespecX" represents host monotonic time. "tscX" represents
1854  * RDTSC value.
1855  *
1856  *              VCPU0 on CPU0           |       VCPU1 on CPU1
1857  *
1858  * 1.  read timespec0,tsc0
1859  * 2.                                   | timespec1 = timespec0 + N
1860  *                                      | tsc1 = tsc0 + M
1861  * 3. transition to guest               | transition to guest
1862  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1863  * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
1864  *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1865  *
1866  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1867  *
1868  *      - ret0 < ret1
1869  *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1870  *              ...
1871  *      - 0 < N - M => M < N
1872  *
1873  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1874  * always the case (the difference between two distinct xtime instances
1875  * might be smaller than the difference between corresponding TSC reads,
1876  * when updating guest vcpus pvclock areas).
1877  *
1878  * To avoid that problem, do not allow visibility of distinct
1879  * system_timestamp/tsc_timestamp values simultaneously: use a master
1880  * copy of host monotonic time values. Update that master copy
1881  * in lockstep.
1882  *
1883  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1884  *
1885  */
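
/*
 * Worked example of the problem above (illustrative, not from the original
 * source): assume a 1 GHz TSC so that 1 cycle == 1 ns.  If the two kvmclock
 * updates observe timespec1 = timespec0 + 100 ns (N = 100) but
 * tsc1 = tsc0 + 150 (M = 150), then for the same rdtsc value
 *
 *      ret1 - ret0 = N - M = -50 ns
 *
 * and a guest reading kvmclock on VCPU1 right after VCPU0 sees time jump
 * backwards by 50 ns.  Using one master copy of (system_timestamp,
 * tsc_timestamp) for all vcpus removes the N/M mismatch entirely.
 */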
1886
1887 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1888 {
1889 #ifdef CONFIG_X86_64
1890         struct kvm_arch *ka = &kvm->arch;
1891         int vclock_mode;
1892         bool host_tsc_clocksource, vcpus_matched;
1893
1894         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1895                         atomic_read(&kvm->online_vcpus));
1896
1897         /*
1898          * If the host uses TSC clock, then passthrough TSC as stable
1899          * to the guest.
1900          */
1901         host_tsc_clocksource = kvm_get_time_and_clockread(
1902                                         &ka->master_kernel_ns,
1903                                         &ka->master_cycle_now);
1904
1905         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1906                                 && !ka->backwards_tsc_observed
1907                                 && !ka->boot_vcpu_runs_old_kvmclock;
1908
1909         if (ka->use_master_clock)
1910                 atomic_set(&kvm_guest_has_master_clock, 1);
1911
1912         vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1913         trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1914                                         vcpus_matched);
1915 #endif
1916 }
1917
1918 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
1919 {
1920         kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
1921 }
1922
1923 static void kvm_gen_update_masterclock(struct kvm *kvm)
1924 {
1925 #ifdef CONFIG_X86_64
1926         int i;
1927         struct kvm_vcpu *vcpu;
1928         struct kvm_arch *ka = &kvm->arch;
1929
1930         spin_lock(&ka->pvclock_gtod_sync_lock);
1931         kvm_make_mclock_inprogress_request(kvm);
1932         /* no guest entries from this point */
1933         pvclock_update_vm_gtod_copy(kvm);
1934
1935         kvm_for_each_vcpu(i, vcpu, kvm)
1936                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1937
1938         /* guest entries allowed */
1939         kvm_for_each_vcpu(i, vcpu, kvm)
1940                 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
1941
1942         spin_unlock(&ka->pvclock_gtod_sync_lock);
1943 #endif
1944 }
1945
1946 u64 get_kvmclock_ns(struct kvm *kvm)
1947 {
1948         struct kvm_arch *ka = &kvm->arch;
1949         struct pvclock_vcpu_time_info hv_clock;
1950         u64 ret;
1951
1952         spin_lock(&ka->pvclock_gtod_sync_lock);
1953         if (!ka->use_master_clock) {
1954                 spin_unlock(&ka->pvclock_gtod_sync_lock);
1955                 return ktime_get_boot_ns() + ka->kvmclock_offset;
1956         }
1957
1958         hv_clock.tsc_timestamp = ka->master_cycle_now;
1959         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
1960         spin_unlock(&ka->pvclock_gtod_sync_lock);
1961
1962         /* both __this_cpu_read() and rdtsc() should be on the same cpu */
1963         get_cpu();
1964
1965         if (__this_cpu_read(cpu_tsc_khz)) {
1966                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
1967                                    &hv_clock.tsc_shift,
1968                                    &hv_clock.tsc_to_system_mul);
1969                 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
1970         } else
1971                 ret = ktime_get_boot_ns() + ka->kvmclock_offset;
1972
1973         put_cpu();
1974
1975         return ret;
1976 }
1977
1978 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1979 {
1980         struct kvm_vcpu_arch *vcpu = &v->arch;
1981         struct pvclock_vcpu_time_info guest_hv_clock;
1982
1983         if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1984                 &guest_hv_clock, sizeof(guest_hv_clock))))
1985                 return;
1986
1987         /* This VCPU is paused, but it's legal for a guest to read another
1988          * VCPU's kvmclock, so we really have to follow the specification where
1989          * it says that version is odd if data is being modified, and even after
1990          * it is consistent.
1991          *
1992          * Version field updates must be kept separate.  This is because
1993          * kvm_write_guest_cached might use a "rep movs" instruction, and
1994          * writes within a string instruction are weakly ordered.  So there
1995          * are three writes overall.
1996          *
1997          * As a small optimization, only write the version field in the first
1998          * and third write.  The vcpu->pv_time cache is still valid, because the
1999          * version field is the first in the struct.
2000          */
2001         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
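        /*
         * For reference, an illustrative sketch (not part of the original
         * code) of the guest-side reader this versioning pairs with, where
         * "pvti" names the guest's mapping of pvclock_vcpu_time_info:
         *
         *      do {
         *              version = pvti->version;
         *              rmb();
         *              ... copy tsc_timestamp, system_time, flags ...
         *              rmb();
         *      } while ((version & 1) || version != pvti->version);
         *
         * That is why the first version write below makes the value odd
         * before the payload is touched, and the last write makes it even
         * again once the payload is consistent.
         */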
2002
2003         if (guest_hv_clock.version & 1)
2004                 ++guest_hv_clock.version;  /* first time write, random junk */
2005
2006         vcpu->hv_clock.version = guest_hv_clock.version + 1;
2007         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2008                                 &vcpu->hv_clock,
2009                                 sizeof(vcpu->hv_clock.version));
2010
2011         smp_wmb();
2012
2013         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2014         vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2015
2016         if (vcpu->pvclock_set_guest_stopped_request) {
2017                 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2018                 vcpu->pvclock_set_guest_stopped_request = false;
2019         }
2020
2021         trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2022
2023         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2024                                 &vcpu->hv_clock,
2025                                 sizeof(vcpu->hv_clock));
2026
2027         smp_wmb();
2028
2029         vcpu->hv_clock.version++;
2030         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2031                                 &vcpu->hv_clock,
2032                                 sizeof(vcpu->hv_clock.version));
2033 }
2034
2035 static int kvm_guest_time_update(struct kvm_vcpu *v)
2036 {
2037         unsigned long flags, tgt_tsc_khz;
2038         struct kvm_vcpu_arch *vcpu = &v->arch;
2039         struct kvm_arch *ka = &v->kvm->arch;
2040         s64 kernel_ns;
2041         u64 tsc_timestamp, host_tsc;
2042         u8 pvclock_flags;
2043         bool use_master_clock;
2044
2045         kernel_ns = 0;
2046         host_tsc = 0;
2047
2048         /*
2049          * If the host uses TSC clock, then passthrough TSC as stable
2050          * to the guest.
2051          */
2052         spin_lock(&ka->pvclock_gtod_sync_lock);
2053         use_master_clock = ka->use_master_clock;
2054         if (use_master_clock) {
2055                 host_tsc = ka->master_cycle_now;
2056                 kernel_ns = ka->master_kernel_ns;
2057         }
2058         spin_unlock(&ka->pvclock_gtod_sync_lock);
2059
2060         /* Keep irq disabled to prevent changes to the clock */
2061         local_irq_save(flags);
2062         tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2063         if (unlikely(tgt_tsc_khz == 0)) {
2064                 local_irq_restore(flags);
2065                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2066                 return 1;
2067         }
2068         if (!use_master_clock) {
2069                 host_tsc = rdtsc();
2070                 kernel_ns = ktime_get_boot_ns();
2071         }
2072
2073         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2074
2075         /*
2076          * We may have to catch up the TSC to match elapsed wall clock
2077          * time for two reasons, even if kvmclock is used.
2078          *   1) CPU could have been running below the maximum TSC rate
2079          *   2) Broken TSC compensation resets the base at each VCPU
2080          *      entry to avoid unknown leaps of TSC even when running
2081          *      again on the same CPU.  This may cause apparent elapsed
2082          *      time to disappear, and the guest to stand still or run
2083          *      very slowly.
2084          */
2085         if (vcpu->tsc_catchup) {
2086                 u64 tsc = compute_guest_tsc(v, kernel_ns);
2087                 if (tsc > tsc_timestamp) {
2088                         adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2089                         tsc_timestamp = tsc;
2090                 }
2091         }
2092
2093         local_irq_restore(flags);
2094
2095         /* With all the info we got, fill in the values */
2096
2097         if (kvm_has_tsc_control)
2098                 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2099
2100         if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2101                 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2102                                    &vcpu->hv_clock.tsc_shift,
2103                                    &vcpu->hv_clock.tsc_to_system_mul);
2104                 vcpu->hw_tsc_khz = tgt_tsc_khz;
2105         }
2106
2107         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2108         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2109         vcpu->last_guest_tsc = tsc_timestamp;
2110
2111         /* If the host uses TSC clocksource, then it is stable */
2112         pvclock_flags = 0;
2113         if (use_master_clock)
2114                 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2115
2116         vcpu->hv_clock.flags = pvclock_flags;
2117
2118         if (vcpu->pv_time_enabled)
2119                 kvm_setup_pvclock_page(v);
2120         if (v == kvm_get_vcpu(v->kvm, 0))
2121                 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2122         return 0;
2123 }
2124
2125 /*
2126  * kvmclock updates which are isolated to a given vcpu, such as
2127  * vcpu->cpu migration, should not allow system_timestamp from
2128  * the rest of the vcpus to remain static. Otherwise ntp frequency
2129  * correction applies to one vcpu's system_timestamp but not
2130  * the others.
2131  *
2132  * So in those cases, request a kvmclock update for all vcpus.
2133  * We need to rate-limit these requests though, as they can
2134  * considerably slow guests that have a large number of vcpus.
2135  * The time for a remote vcpu to update its kvmclock is bound
2136  * by the delay we use to rate-limit the updates.
2137  */
2138
2139 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
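
/*
 * Note (added for clarity, not in the original source): because
 * schedule_delayed_work() is a no-op when the given work item is already
 * pending, any number of per-vcpu update requests that arrive within a
 * single 100ms window coalesce into one run of kvmclock_update_fn(), which
 * then refreshes the kvmclock of every vcpu in the VM.
 */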
2140
2141 static void kvmclock_update_fn(struct work_struct *work)
2142 {
2143         int i;
2144         struct delayed_work *dwork = to_delayed_work(work);
2145         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2146                                            kvmclock_update_work);
2147         struct kvm *kvm = container_of(ka, struct kvm, arch);
2148         struct kvm_vcpu *vcpu;
2149
2150         kvm_for_each_vcpu(i, vcpu, kvm) {
2151                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2152                 kvm_vcpu_kick(vcpu);
2153         }
2154 }
2155
2156 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2157 {
2158         struct kvm *kvm = v->kvm;
2159
2160         kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2161         schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2162                                         KVMCLOCK_UPDATE_DELAY);
2163 }
2164
2165 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2166
2167 static void kvmclock_sync_fn(struct work_struct *work)
2168 {
2169         struct delayed_work *dwork = to_delayed_work(work);
2170         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2171                                            kvmclock_sync_work);
2172         struct kvm *kvm = container_of(ka, struct kvm, arch);
2173
2174         if (!kvmclock_periodic_sync)
2175                 return;
2176
2177         schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2178         schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2179                                         KVMCLOCK_SYNC_PERIOD);
2180 }
2181
2182 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2183 {
2184         u64 mcg_cap = vcpu->arch.mcg_cap;
2185         unsigned bank_num = mcg_cap & 0xff;
2186         u32 msr = msr_info->index;
2187         u64 data = msr_info->data;
2188
2189         switch (msr) {
2190         case MSR_IA32_MCG_STATUS:
2191                 vcpu->arch.mcg_status = data;
2192                 break;
2193         case MSR_IA32_MCG_CTL:
2194                 if (!(mcg_cap & MCG_CTL_P) &&
2195                     (data || !msr_info->host_initiated))
2196                         return 1;
2197                 if (data != 0 && data != ~(u64)0)
2198                         return 1;
2199                 vcpu->arch.mcg_ctl = data;
2200                 break;
2201         default:
2202                 if (msr >= MSR_IA32_MC0_CTL &&
2203                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2204                         u32 offset = msr - MSR_IA32_MC0_CTL;
2205                         /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2206                          * Some Linux kernels, though, clear bit 10 in bank 4 to
2207                          * work around a BIOS/GART TBL issue on AMD K8s; ignore
2208                          * this to avoid an uncaught #GP in the guest.
2209                          */
2210                         if ((offset & 0x3) == 0 &&
2211                             data != 0 && (data | (1 << 10)) != ~(u64)0)
2212                                 return -1;
2213                         if (!msr_info->host_initiated &&
2214                                 (offset & 0x3) == 1 && data != 0)
2215                                 return -1;
2216                         vcpu->arch.mce_banks[offset] = data;
2217                         break;
2218                 }
2219                 return 1;
2220         }
2221         return 0;
2222 }
2223
2224 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2225 {
2226         struct kvm *kvm = vcpu->kvm;
2227         int lm = is_long_mode(vcpu);
2228         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2229                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2230         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2231                 : kvm->arch.xen_hvm_config.blob_size_32;
2232         u32 page_num = data & ~PAGE_MASK;
2233         u64 page_addr = data & PAGE_MASK;
2234         u8 *page;
2235         int r;
2236
2237         r = -E2BIG;
2238         if (page_num >= blob_size)
2239                 goto out;
2240         r = -ENOMEM;
2241         page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2242         if (IS_ERR(page)) {
2243                 r = PTR_ERR(page);
2244                 goto out;
2245         }
2246         if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2247                 goto out_free;
2248         r = 0;
2249 out_free:
2250         kfree(page);
2251 out:
2252         return r;
2253 }
2254
2255 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2256 {
2257         gpa_t gpa = data & ~0x3f;
2258
2259         /* Bits 3:5 are reserved; should be zero */
2260         if (data & 0x38)
2261                 return 1;
2262
2263         vcpu->arch.apf.msr_val = data;
2264
2265         if (!(data & KVM_ASYNC_PF_ENABLED)) {
2266                 kvm_clear_async_pf_completion_queue(vcpu);
2267                 kvm_async_pf_hash_reset(vcpu);
2268                 return 0;
2269         }
2270
2271         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2272                                         sizeof(u32)))
2273                 return 1;
2274
2275         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2276         vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2277         kvm_async_pf_wakeup_all(vcpu);
2278         return 0;
2279 }
2280
2281 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2282 {
2283         vcpu->arch.pv_time_enabled = false;
2284 }
2285
2286 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
2287 {
2288         ++vcpu->stat.tlb_flush;
2289         kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
2290 }
2291
2292 static void record_steal_time(struct kvm_vcpu *vcpu)
2293 {
2294         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2295                 return;
2296
2297         if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2298                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2299                 return;
2300
2301         /*
2302          * Doing a TLB flush here, on the guest's behalf, can avoid
2303          * expensive IPIs.
2304          */
2305         if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
2306                 kvm_vcpu_flush_tlb(vcpu, false);
2307
2308         if (vcpu->arch.st.steal.version & 1)
2309                 vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
2310
2311         vcpu->arch.st.steal.version += 1;
2312
2313         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2314                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2315
2316         smp_wmb();
2317
2318         vcpu->arch.st.steal.steal += current->sched_info.run_delay -
2319                 vcpu->arch.st.last_steal;
2320         vcpu->arch.st.last_steal = current->sched_info.run_delay;
2321
2322         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2323                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2324
2325         smp_wmb();
2326
2327         vcpu->arch.st.steal.version += 1;
2328
2329         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2330                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2331 }
2332
2333 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2334 {
2335         bool pr = false;
2336         u32 msr = msr_info->index;
2337         u64 data = msr_info->data;
2338
2339         switch (msr) {
2340         case MSR_AMD64_NB_CFG:
2341         case MSR_IA32_UCODE_WRITE:
2342         case MSR_VM_HSAVE_PA:
2343         case MSR_AMD64_PATCH_LOADER:
2344         case MSR_AMD64_BU_CFG2:
2345         case MSR_AMD64_DC_CFG:
2346                 break;
2347
2348         case MSR_IA32_UCODE_REV:
2349                 if (msr_info->host_initiated)
2350                         vcpu->arch.microcode_version = data;
2351                 break;
2352         case MSR_EFER:
2353                 return set_efer(vcpu, data);
2354         case MSR_K7_HWCR:
2355                 data &= ~(u64)0x40;     /* ignore flush filter disable */
2356                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
2357                 data &= ~(u64)0x8;      /* ignore TLB cache disable */
2358                 data &= ~(u64)0x40000;  /* ignore Mc status write enable */
2359                 if (data != 0) {
2360                         vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2361                                     data);
2362                         return 1;
2363                 }
2364                 break;
2365         case MSR_FAM10H_MMIO_CONF_BASE:
2366                 if (data != 0) {
2367                         vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2368                                     "0x%llx\n", data);
2369                         return 1;
2370                 }
2371                 break;
2372         case MSR_IA32_DEBUGCTLMSR:
2373                 if (!data) {
2374                         /* We support the non-activated case already */
2375                         break;
2376                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2377                         /* Values other than LBR and BTF are vendor-specific,
2378                            thus reserved and should throw a #GP */
2379                         return 1;
2380                 }
2381                 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2382                             __func__, data);
2383                 break;
2384         case 0x200 ... 0x2ff:
2385                 return kvm_mtrr_set_msr(vcpu, msr, data);
2386         case MSR_IA32_APICBASE:
2387                 return kvm_set_apic_base(vcpu, msr_info);
2388         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2389                 return kvm_x2apic_msr_write(vcpu, msr, data);
2390         case MSR_IA32_TSCDEADLINE:
2391                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2392                 break;
2393         case MSR_IA32_TSC_ADJUST:
2394                 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
2395                         if (!msr_info->host_initiated) {
2396                                 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2397                                 adjust_tsc_offset_guest(vcpu, adj);
2398                         }
2399                         vcpu->arch.ia32_tsc_adjust_msr = data;
2400                 }
2401                 break;
2402         case MSR_IA32_MISC_ENABLE:
2403                 vcpu->arch.ia32_misc_enable_msr = data;
2404                 break;
2405         case MSR_IA32_SMBASE:
2406                 if (!msr_info->host_initiated)
2407                         return 1;
2408                 vcpu->arch.smbase = data;
2409                 break;
2410         case MSR_IA32_TSC:
2411                 kvm_write_tsc(vcpu, msr_info);
2412                 break;
2413         case MSR_SMI_COUNT:
2414                 if (!msr_info->host_initiated)
2415                         return 1;
2416                 vcpu->arch.smi_count = data;
2417                 break;
2418         case MSR_KVM_WALL_CLOCK_NEW:
2419         case MSR_KVM_WALL_CLOCK:
2420                 vcpu->kvm->arch.wall_clock = data;
2421                 kvm_write_wall_clock(vcpu->kvm, data);
2422                 break;
2423         case MSR_KVM_SYSTEM_TIME_NEW:
2424         case MSR_KVM_SYSTEM_TIME: {
2425                 struct kvm_arch *ka = &vcpu->kvm->arch;
2426
2427                 kvmclock_reset(vcpu);
2428
2429                 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2430                         bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2431
2432                         if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2433                                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2434
2435                         ka->boot_vcpu_runs_old_kvmclock = tmp;
2436                 }
2437
2438                 vcpu->arch.time = data;
2439                 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2440
2441                 /* check whether the enable bit is set... */
2442                 if (!(data & 1))
2443                         break;
2444
2445                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2446                      &vcpu->arch.pv_time, data & ~1ULL,
2447                      sizeof(struct pvclock_vcpu_time_info)))
2448                         vcpu->arch.pv_time_enabled = false;
2449                 else
2450                         vcpu->arch.pv_time_enabled = true;
2451
2452                 break;
2453         }
2454         case MSR_KVM_ASYNC_PF_EN:
2455                 if (kvm_pv_enable_async_pf(vcpu, data))
2456                         return 1;
2457                 break;
2458         case MSR_KVM_STEAL_TIME:
2459
2460                 if (unlikely(!sched_info_on()))
2461                         return 1;
2462
2463                 if (data & KVM_STEAL_RESERVED_MASK)
2464                         return 1;
2465
2466                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2467                                                 data & KVM_STEAL_VALID_BITS,
2468                                                 sizeof(struct kvm_steal_time)))
2469                         return 1;
2470
2471                 vcpu->arch.st.msr_val = data;
2472
2473                 if (!(data & KVM_MSR_ENABLED))
2474                         break;
2475
2476                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2477
2478                 break;
2479         case MSR_KVM_PV_EOI_EN:
2480                 if (kvm_lapic_enable_pv_eoi(vcpu, data))
2481                         return 1;
2482                 break;
2483
2484         case MSR_IA32_MCG_CTL:
2485         case MSR_IA32_MCG_STATUS:
2486         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2487                 return set_msr_mce(vcpu, msr_info);
2488
2489         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2490         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2491                 pr = true; /* fall through */
2492         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2493         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2494                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2495                         return kvm_pmu_set_msr(vcpu, msr_info);
2496
2497                 if (pr || data != 0)
2498                         vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2499                                     "0x%x data 0x%llx\n", msr, data);
2500                 break;
2501         case MSR_K7_CLK_CTL:
2502                 /*
2503                  * Ignore all writes to this no longer documented MSR.
2504                  * Writes are only relevant for old K7 processors,
2505                  * all pre-dating SVM, but a recommended workaround from
2506                  * AMD for these chips. It is possible to specify the
2507                  * affected processor models on the command line, hence
2508                  * the need to ignore the workaround.
2509                  */
2510                 break;
2511         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2512         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2513         case HV_X64_MSR_CRASH_CTL:
2514         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2515         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2516         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2517         case HV_X64_MSR_TSC_EMULATION_STATUS:
2518                 return kvm_hv_set_msr_common(vcpu, msr, data,
2519                                              msr_info->host_initiated);
2520         case MSR_IA32_BBL_CR_CTL3:
2521                 /* Drop writes to this legacy MSR -- see rdmsr
2522                  * counterpart for further detail.
2523                  */
2524                 if (report_ignored_msrs)
2525                         vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
2526                                 msr, data);
2527                 break;
2528         case MSR_AMD64_OSVW_ID_LENGTH:
2529                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2530                         return 1;
2531                 vcpu->arch.osvw.length = data;
2532                 break;
2533         case MSR_AMD64_OSVW_STATUS:
2534                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2535                         return 1;
2536                 vcpu->arch.osvw.status = data;
2537                 break;
2538         case MSR_PLATFORM_INFO:
2539                 if (!msr_info->host_initiated ||
2540                     data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
2541                     (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
2542                      cpuid_fault_enabled(vcpu)))
2543                         return 1;
2544                 vcpu->arch.msr_platform_info = data;
2545                 break;
2546         case MSR_MISC_FEATURES_ENABLES:
2547                 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
2548                     (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
2549                      !supports_cpuid_fault(vcpu)))
2550                         return 1;
2551                 vcpu->arch.msr_misc_features_enables = data;
2552                 break;
2553         default:
2554                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2555                         return xen_hvm_config(vcpu, data);
2556                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2557                         return kvm_pmu_set_msr(vcpu, msr_info);
2558                 if (!ignore_msrs) {
2559                         vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2560                                     msr, data);
2561                         return 1;
2562                 } else {
2563                         if (report_ignored_msrs)
2564                                 vcpu_unimpl(vcpu,
2565                                         "ignored wrmsr: 0x%x data 0x%llx\n",
2566                                         msr, data);
2567                         break;
2568                 }
2569         }
2570         return 0;
2571 }
2572 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2573
2574
2575 /*
2576  * Reads an msr value (of 'msr_index') into 'pdata'.
2577  * Returns 0 on success, non-0 otherwise.
2578  * Assumes vcpu_load() was already called.
2579  */
2580 int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2581 {
2582         return kvm_x86_ops->get_msr(vcpu, msr);
2583 }
2584 EXPORT_SYMBOL_GPL(kvm_get_msr);
2585
2586 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
2587 {
2588         u64 data;
2589         u64 mcg_cap = vcpu->arch.mcg_cap;
2590         unsigned bank_num = mcg_cap & 0xff;
2591
2592         switch (msr) {
2593         case MSR_IA32_P5_MC_ADDR:
2594         case MSR_IA32_P5_MC_TYPE:
2595                 data = 0;
2596                 break;
2597         case MSR_IA32_MCG_CAP:
2598                 data = vcpu->arch.mcg_cap;
2599                 break;
2600         case MSR_IA32_MCG_CTL:
2601                 if (!(mcg_cap & MCG_CTL_P) && !host)
2602                         return 1;
2603                 data = vcpu->arch.mcg_ctl;
2604                 break;
2605         case MSR_IA32_MCG_STATUS:
2606                 data = vcpu->arch.mcg_status;
2607                 break;
2608         default:
2609                 if (msr >= MSR_IA32_MC0_CTL &&
2610                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2611                         u32 offset = msr - MSR_IA32_MC0_CTL;
2612                         data = vcpu->arch.mce_banks[offset];
2613                         break;
2614                 }
2615                 return 1;
2616         }
2617         *pdata = data;
2618         return 0;
2619 }
2620
2621 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2622 {
2623         switch (msr_info->index) {
2624         case MSR_IA32_PLATFORM_ID:
2625         case MSR_IA32_EBL_CR_POWERON:
2626         case MSR_IA32_DEBUGCTLMSR:
2627         case MSR_IA32_LASTBRANCHFROMIP:
2628         case MSR_IA32_LASTBRANCHTOIP:
2629         case MSR_IA32_LASTINTFROMIP:
2630         case MSR_IA32_LASTINTTOIP:
2631         case MSR_K8_SYSCFG:
2632         case MSR_K8_TSEG_ADDR:
2633         case MSR_K8_TSEG_MASK:
2634         case MSR_K7_HWCR:
2635         case MSR_VM_HSAVE_PA:
2636         case MSR_K8_INT_PENDING_MSG:
2637         case MSR_AMD64_NB_CFG:
2638         case MSR_FAM10H_MMIO_CONF_BASE:
2639         case MSR_AMD64_BU_CFG2:
2640         case MSR_IA32_PERF_CTL:
2641         case MSR_AMD64_DC_CFG:
2642                 msr_info->data = 0;
2643                 break;
2644         case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
2645         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2646         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2647         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2648         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2649                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2650                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2651                 msr_info->data = 0;
2652                 break;
2653         case MSR_IA32_UCODE_REV:
2654                 msr_info->data = vcpu->arch.microcode_version;
2655                 break;
2656         case MSR_IA32_TSC:
2657                 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
2658                 break;
2659         case MSR_MTRRcap:
2660         case 0x200 ... 0x2ff:
2661                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
2662         case 0xcd: /* fsb frequency */
2663                 msr_info->data = 3;
2664                 break;
2665                 /*
2666                  * MSR_EBC_FREQUENCY_ID
2667                  * Conservative value valid for even the basic CPU models.
2668                  * Models 0 and 1: 000 in bits 23:21 indicating a bus speed
2669                  * of 100MHz; model 2: 000 in bits 18:16 indicating 100MHz;
2670                  * and 266MHz for models 3 and 4. Set the Core Clock
2671                  * Frequency to System Bus Frequency Ratio to 1 (bits
2672                  * 31:24) even though it is only valid for CPU
2673                  * models > 2; otherwise guests may end up dividing or
2674                  * multiplying by zero.
2675                  */
2676         case MSR_EBC_FREQUENCY_ID:
2677                 msr_info->data = 1 << 24;
2678                 break;
2679         case MSR_IA32_APICBASE:
2680                 msr_info->data = kvm_get_apic_base(vcpu);
2681                 break;
2682         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2683                 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2685         case MSR_IA32_TSCDEADLINE:
2686                 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
2687                 break;
2688         case MSR_IA32_TSC_ADJUST:
2689                 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2690                 break;
2691         case MSR_IA32_MISC_ENABLE:
2692                 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
2693                 break;
2694         case MSR_IA32_SMBASE:
2695                 if (!msr_info->host_initiated)
2696                         return 1;
2697                 msr_info->data = vcpu->arch.smbase;
2698                 break;
2699         case MSR_SMI_COUNT:
2700                 msr_info->data = vcpu->arch.smi_count;
2701                 break;
2702         case MSR_IA32_PERF_STATUS:
2703                 /* TSC increment by tick */
2704                 msr_info->data = 1000ULL;
2705                 /* CPU multiplier */
2706                 msr_info->data |= (((uint64_t)4ULL) << 40);
2707                 break;
2708         case MSR_EFER:
2709                 msr_info->data = vcpu->arch.efer;
2710                 break;
2711         case MSR_KVM_WALL_CLOCK:
2712         case MSR_KVM_WALL_CLOCK_NEW:
2713                 msr_info->data = vcpu->kvm->arch.wall_clock;
2714                 break;
2715         case MSR_KVM_SYSTEM_TIME:
2716         case MSR_KVM_SYSTEM_TIME_NEW:
2717                 msr_info->data = vcpu->arch.time;
2718                 break;
2719         case MSR_KVM_ASYNC_PF_EN:
2720                 msr_info->data = vcpu->arch.apf.msr_val;
2721                 break;
2722         case MSR_KVM_STEAL_TIME:
2723                 msr_info->data = vcpu->arch.st.msr_val;
2724                 break;
2725         case MSR_KVM_PV_EOI_EN:
2726                 msr_info->data = vcpu->arch.pv_eoi.msr_val;
2727                 break;
2728         case MSR_IA32_P5_MC_ADDR:
2729         case MSR_IA32_P5_MC_TYPE:
2730         case MSR_IA32_MCG_CAP:
2731         case MSR_IA32_MCG_CTL:
2732         case MSR_IA32_MCG_STATUS:
2733         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2734                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
2735                                    msr_info->host_initiated);
2736         case MSR_K7_CLK_CTL:
2737                 /*
2738                  * Provide expected ramp-up count for K7. All other
2739                  * are set to zero, indicating minimum divisors for
2740                  * every field.
2741                  *
2742                  * This prevents guest kernels on AMD host with CPU
2743                  * type 6, model 8 and higher from exploding due to
2744                  * the rdmsr failing.
2745                  */
2746                 msr_info->data = 0x20000000;
2747                 break;
2748         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2749         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2750         case HV_X64_MSR_CRASH_CTL:
2751         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2752         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2753         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2754         case HV_X64_MSR_TSC_EMULATION_STATUS:
2755                 return kvm_hv_get_msr_common(vcpu,
2756                                              msr_info->index, &msr_info->data,
2757                                              msr_info->host_initiated);
2759         case MSR_IA32_BBL_CR_CTL3:
2760                 /* This legacy MSR exists but isn't fully documented in current
2761                  * silicon.  It is however accessed by winxp in very narrow
2762                  * scenarios where it sets bit #19, itself documented as
2763                  * a "reserved" bit.  Best effort attempt to source coherent
2764                  * read data here should the balance of the register be
2765                  * interpreted by the guest:
2766                  *
2767                  * L2 cache control register 3: 64GB range, 256KB size,
2768                  * enabled, latency 0x1, configured
2769                  */
2770                 msr_info->data = 0xbe702111;
2771                 break;
2772         case MSR_AMD64_OSVW_ID_LENGTH:
2773                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2774                         return 1;
2775                 msr_info->data = vcpu->arch.osvw.length;
2776                 break;
2777         case MSR_AMD64_OSVW_STATUS:
2778                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2779                         return 1;
2780                 msr_info->data = vcpu->arch.osvw.status;
2781                 break;
2782         case MSR_PLATFORM_INFO:
2783                 msr_info->data = vcpu->arch.msr_platform_info;
2784                 break;
2785         case MSR_MISC_FEATURES_ENABLES:
2786                 msr_info->data = vcpu->arch.msr_misc_features_enables;
2787                 break;
2788         default:
2789                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2790                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2791                 if (!ignore_msrs) {
2792                         vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2793                                                msr_info->index);
2794                         return 1;
2795                 } else {
2796                         if (report_ignored_msrs)
2797                                 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2798                                         msr_info->index);
2799                         msr_info->data = 0;
2800                 }
2801                 break;
2802         }
2803         return 0;
2804 }
2805 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2806
2807 /*
2808  * Read or write a bunch of msrs. All parameters are kernel addresses.
2809  *
2810  * @return number of msrs set successfully.
2811  */
2812 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2813                     struct kvm_msr_entry *entries,
2814                     int (*do_msr)(struct kvm_vcpu *vcpu,
2815                                   unsigned index, u64 *data))
2816 {
2817         int i;
2818
2819         for (i = 0; i < msrs->nmsrs; ++i)
2820                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2821                         break;
2822
2823         return i;
2824 }
2825
2826 /*
2827  * Read or write a bunch of msrs. Parameters are user addresses.
2828  *
2829  * @return number of msrs set successfully.
2830  */
2831 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2832                   int (*do_msr)(struct kvm_vcpu *vcpu,
2833                                 unsigned index, u64 *data),
2834                   int writeback)
2835 {
2836         struct kvm_msrs msrs;
2837         struct kvm_msr_entry *entries;
2838         int r, n;
2839         unsigned size;
2840
2841         r = -EFAULT;
2842         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2843                 goto out;
2844
2845         r = -E2BIG;
2846         if (msrs.nmsrs >= MAX_IO_MSRS)
2847                 goto out;
2848
2849         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2850         entries = memdup_user(user_msrs->entries, size);
2851         if (IS_ERR(entries)) {
2852                 r = PTR_ERR(entries);
2853                 goto out;
2854         }
2855
2856         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2857         if (r < 0)
2858                 goto out_free;
2859
2860         r = -EFAULT;
2861         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2862                 goto out_free;
2863
2864         r = n;
2865
2866 out_free:
2867         kfree(entries);
2868 out:
2869         return r;
2870 }
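
/*
 * Illustrative userspace sketch (not part of the original file): the vcpu
 * ioctls backed by msr_io() take a struct kvm_msrs header followed by an
 * array of struct kvm_msr_entry and return the number of entries processed,
 * e.g.:
 *
 *      struct { struct kvm_msrs hdr; struct kvm_msr_entry e[1]; } buf = {
 *              .hdr.nmsrs  = 1,
 *              .e[0].index = MSR_IA32_TSC,
 *      };
 *      int n = ioctl(vcpu_fd, KVM_GET_MSRS, &buf);  // vcpu_fd: example name
 *      // on success n == 1 and buf.e[0].data holds the guest's TSC value
 *
 * KVM_SET_MSRS takes the same layout but consumes .data from each entry.
 */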
2871
2872 static inline bool kvm_can_mwait_in_guest(void)
2873 {
2874         return boot_cpu_has(X86_FEATURE_MWAIT) &&
2875                 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
2876                 boot_cpu_has(X86_FEATURE_ARAT);
2877 }
2878
2879 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2880 {
2881         int r = 0;
2882
2883         switch (ext) {
2884         case KVM_CAP_IRQCHIP:
2885         case KVM_CAP_HLT:
2886         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2887         case KVM_CAP_SET_TSS_ADDR:
2888         case KVM_CAP_EXT_CPUID:
2889         case KVM_CAP_EXT_EMUL_CPUID:
2890         case KVM_CAP_CLOCKSOURCE:
2891         case KVM_CAP_PIT:
2892         case KVM_CAP_NOP_IO_DELAY:
2893         case KVM_CAP_MP_STATE:
2894         case KVM_CAP_SYNC_MMU:
2895         case KVM_CAP_USER_NMI:
2896         case KVM_CAP_REINJECT_CONTROL:
2897         case KVM_CAP_IRQ_INJECT_STATUS:
2898         case KVM_CAP_IOEVENTFD:
2899         case KVM_CAP_IOEVENTFD_NO_LENGTH:
2900         case KVM_CAP_PIT2:
2901         case KVM_CAP_PIT_STATE2:
2902         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2903         case KVM_CAP_XEN_HVM:
2904         case KVM_CAP_VCPU_EVENTS:
2905         case KVM_CAP_HYPERV:
2906         case KVM_CAP_HYPERV_VAPIC:
2907         case KVM_CAP_HYPERV_SPIN:
2908         case KVM_CAP_HYPERV_SYNIC:
2909         case KVM_CAP_HYPERV_SYNIC2:
2910         case KVM_CAP_HYPERV_VP_INDEX:
2911         case KVM_CAP_HYPERV_EVENTFD:
2912         case KVM_CAP_HYPERV_TLBFLUSH:
2913         case KVM_CAP_PCI_SEGMENT:
2914         case KVM_CAP_DEBUGREGS:
2915         case KVM_CAP_X86_ROBUST_SINGLESTEP:
2916         case KVM_CAP_XSAVE:
2917         case KVM_CAP_ASYNC_PF:
2918         case KVM_CAP_GET_TSC_KHZ:
2919         case KVM_CAP_KVMCLOCK_CTRL:
2920         case KVM_CAP_READONLY_MEM:
2921         case KVM_CAP_HYPERV_TIME:
2922         case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2923         case KVM_CAP_TSC_DEADLINE_TIMER:
2924         case KVM_CAP_ENABLE_CAP_VM:
2925         case KVM_CAP_DISABLE_QUIRKS:
2926         case KVM_CAP_SET_BOOT_CPU_ID:
2927         case KVM_CAP_SPLIT_IRQCHIP:
2928         case KVM_CAP_IMMEDIATE_EXIT:
2929         case KVM_CAP_GET_MSR_FEATURES:
2930                 r = 1;
2931                 break;
2932         case KVM_CAP_SYNC_REGS:
2933                 r = KVM_SYNC_X86_VALID_FIELDS;
2934                 break;
2935         case KVM_CAP_ADJUST_CLOCK:
2936                 r = KVM_CLOCK_TSC_STABLE;
2937                 break;
2938         case KVM_CAP_X86_DISABLE_EXITS:
2939                 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
2940                 if (kvm_can_mwait_in_guest())
2941                         r |= KVM_X86_DISABLE_EXITS_MWAIT;
2942                 break;
2943         case KVM_CAP_X86_SMM:
2944                 /* SMBASE is usually relocated above 1M on modern chipsets,
2945                  * and SMM handlers might indeed rely on 4G segment limits,
2946                  * so do not report SMM to be available if real mode is
2947                  * emulated via vm86 mode.  Still, do not go to great lengths
2948                  * to avoid userspace's usage of the feature, because it is a
2949                  * fringe case that is not enabled except via specific settings
2950                  * of the module parameters.
2951                  */
2952                 r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
2953                 break;
2954         case KVM_CAP_VAPIC:
2955                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2956                 break;
2957         case KVM_CAP_NR_VCPUS:
2958                 r = KVM_SOFT_MAX_VCPUS;
2959                 break;
2960         case KVM_CAP_MAX_VCPUS:
2961                 r = KVM_MAX_VCPUS;
2962                 break;
2963         case KVM_CAP_NR_MEMSLOTS:
2964                 r = KVM_USER_MEM_SLOTS;
2965                 break;
2966         case KVM_CAP_PV_MMU:    /* obsolete */
2967                 r = 0;
2968                 break;
2969         case KVM_CAP_MCE:
2970                 r = KVM_MAX_MCE_BANKS;
2971                 break;
2972         case KVM_CAP_XCRS:
2973                 r = boot_cpu_has(X86_FEATURE_XSAVE);
2974                 break;
2975         case KVM_CAP_TSC_CONTROL:
2976                 r = kvm_has_tsc_control;
2977                 break;
2978         case KVM_CAP_X2APIC_API:
2979                 r = KVM_X2APIC_API_VALID_FLAGS;
2980                 break;
2981         case KVM_CAP_NESTED_STATE:
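                     /*
                      * With a NULL vcpu the vendor callback reports the
                      * maximum buffer size userspace must provide for
                      * KVM_GET_NESTED_STATE; 0 means nested state is not
                      * supported.
                      */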
2982                 r = kvm_x86_ops->get_nested_state ?
2983                         kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
2984                 break;
2985         default:
2986                 break;
2987         }
2988         return r;
2989
2990 }
2991
2992 long kvm_arch_dev_ioctl(struct file *filp,
2993                         unsigned int ioctl, unsigned long arg)
2994 {
2995         void __user *argp = (void __user *)arg;
2996         long r;
2997
2998         switch (ioctl) {
2999         case KVM_GET_MSR_INDEX_LIST: {
3000                 struct kvm_msr_list __user *user_msr_list = argp;
3001                 struct kvm_msr_list msr_list;
3002                 unsigned n;
3003
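                     /*
                      * msr_list.nmsrs holds the capacity of the userspace
                      * array on input and the number of MSRs KVM wants to
                      * report on output; -E2BIG tells userspace to retry
                      * with a larger array.
                      */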
3004                 r = -EFAULT;
3005                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3006                         goto out;
3007                 n = msr_list.nmsrs;
3008                 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3009                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3010                         goto out;
3011                 r = -E2BIG;
3012                 if (n < msr_list.nmsrs)
3013                         goto out;
3014                 r = -EFAULT;
3015                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3016                                  num_msrs_to_save * sizeof(u32)))
3017                         goto out;
3018                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
3019                                  &emulated_msrs,
3020                                  num_emulated_msrs * sizeof(u32)))
3021                         goto out;
3022                 r = 0;
3023                 break;
3024         }
3025         case KVM_GET_SUPPORTED_CPUID:
3026         case KVM_GET_EMULATED_CPUID: {
3027                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3028                 struct kvm_cpuid2 cpuid;
3029
3030                 r = -EFAULT;
3031                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3032                         goto out;
3033
3034                 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3035                                             ioctl);
3036                 if (r)
3037                         goto out;
3038
3039                 r = -EFAULT;
3040                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
3041                         goto out;
3042                 r = 0;
3043                 break;
3044         }
3045         case KVM_X86_GET_MCE_CAP_SUPPORTED:
3046                 r = -EFAULT;
3047                 if (copy_to_user(argp, &kvm_mce_cap_supported,
3048                                  sizeof(kvm_mce_cap_supported)))
3049                         goto out;
3050                 r = 0;
3051                 break;
3052         case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3053                 struct kvm_msr_list __user *user_msr_list = argp;
3054                 struct kvm_msr_list msr_list;
3055                 unsigned int n;
3056
3057                 r = -EFAULT;
3058                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3059                         goto out;
3060                 n = msr_list.nmsrs;
3061                 msr_list.nmsrs = num_msr_based_features;
3062                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3063                         goto out;
3064                 r = -E2BIG;
3065                 if (n < msr_list.nmsrs)
3066                         goto out;
3067                 r = -EFAULT;
3068                 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3069                                  num_msr_based_features * sizeof(u32)))
3070                         goto out;
3071                 r = 0;
3072                 break;
3073         }
3074         case KVM_GET_MSRS:
3075                 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3076                 break;
3077
3078         default:
3079                 r = -EINVAL;
3080         }
3081 out:
3082         return r;
3083 }
3084
3085 static void wbinvd_ipi(void *garbage)
3086 {
3087         wbinvd();
3088 }
3089
3090 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3091 {
3092         return kvm_arch_has_noncoherent_dma(vcpu->kvm);
3093 }
3094
3095 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3096 {
3097         /* Deal with the fact that WBINVD may be executed by the guest */
3098         if (need_emulate_wbinvd(vcpu)) {
3099                 if (kvm_x86_ops->has_wbinvd_exit())
3100                         cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3101                 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3102                         smp_call_function_single(vcpu->cpu,
3103                                         wbinvd_ipi, NULL, 1);
3104         }
3105
3106         kvm_x86_ops->vcpu_load(vcpu, cpu);
3107
3108         /* Apply any externally detected TSC adjustments (due to suspend) */
3109         if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3110                 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3111                 vcpu->arch.tsc_offset_adjustment = 0;
3112                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3113         }
3114
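             /*
              * When the vCPU is scheduled onto a different physical CPU, or
              * the host TSC is unstable, check whether the host TSC went
              * backwards and, if the TSC is unstable, recompute the guest's
              * TSC offset so guest time does not appear to jump backwards.
              */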
3115         if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
3116                 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
3117                                 rdtsc() - vcpu->arch.last_host_tsc;
3118                 if (tsc_delta < 0)
3119                         mark_tsc_unstable("KVM discovered backwards TSC");
3120
3121                 if (kvm_check_tsc_unstable()) {
3122                         u64 offset = kvm_compute_tsc_offset(vcpu,
3123                                                 vcpu->arch.last_guest_tsc);
3124                         kvm_vcpu_write_tsc_offset(vcpu, offset);
3125                         vcpu->arch.tsc_catchup = 1;
3126                 }
3127
3128                 if (kvm_lapic_hv_timer_in_use(vcpu))
3129                         kvm_lapic_restart_hv_timer(vcpu);
3130
3131                 /*
3132                  * On a host with synchronized TSC, there is no need to update
3133                  * kvmclock on vcpu->cpu migration
3134                  */
3135                 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
3136                         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
3137                 if (vcpu->cpu != cpu)
3138                         kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
3139                 vcpu->cpu = cpu;
3140         }
3141
3142         kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3143 }
3144
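     /*
      * Record in the guest's steal-time area that this vCPU was preempted,
      * so a paravirtualized guest (e.g. vcpu_is_preempted()) can avoid
      * spinning on a vCPU that is not running.
      */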
3145 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3146 {
3147         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3148                 return;
3149
3150         vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
3151
3152         kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
3153                         &vcpu->arch.st.steal.preempted,
3154                         offsetof(struct kvm_steal_time, preempted),
3155                         sizeof(vcpu->arch.st.steal.preempted));
3156 }
3157
3158 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3159 {
3160         int idx;
3161
3162         if (vcpu->preempted)
3163                 vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
3164
3165         /*
3166          * Disable page faults because we're in atomic context here.
3167          * kvm_write_guest_offset_cached() would call might_fault()
3168          * that relies on pagefault_disable() to tell if there's a
3169          * bug. NOTE: the write to guest memory may not go through if it
3170          * happens during postcopy live migration or under heavy guest
3171          * paging.
3172          */
3173         pagefault_disable();
3174         /*
3175          * kvm_memslots() will be called by
3176          * kvm_write_guest_offset_cached() so take the srcu lock.
3177          */
3178         idx = srcu_read_lock(&vcpu->kvm->srcu);
3179         kvm_steal_time_set_preempted(vcpu);
3180         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3181         pagefault_enable();
3182         kvm_x86_ops->vcpu_put(vcpu);
3183         vcpu->arch.last_host_tsc = rdtsc();
3184         /*
3185          * If userspace has set any breakpoints or watchpoints, dr6 is restored
3186          * on every vmexit, but if not, we might have a stale dr6 from the
3187          * guest. do_debug expects dr6 to be cleared after it runs, so do the same.
3188          */
3189         set_debugreg(0, 6);
3190 }
3191
3192 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
3193                                     struct kvm_lapic_state *s)
3194 {
3195         if (vcpu->arch.apicv_active)
3196                 kvm_x86_ops->sync_pir_to_irr(vcpu);
3197
3198         return kvm_apic_get_state(vcpu, s);
3199 }
3200
3201 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
3202                                     struct kvm_lapic_state *s)
3203 {
3204         int r;
3205
3206         r = kvm_apic_set_state(vcpu, s);
3207         if (r)
3208                 return r;
3209         update_cr8_intercept(vcpu);
3210
3211         return 0;
3212 }
3213
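     /*
      * Can an interrupt injected by userspace be delivered to this vCPU?
      * Yes if the local APIC is emulated in userspace, or if the in-kernel
      * LAPIC is configured to accept ExtINT (PIC) interrupts.
      */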
3214 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
3215 {
3216         return (!lapic_in_kernel(vcpu) ||
3217                 kvm_apic_accept_pic_intr(vcpu));
3218 }
3219
3220 /*
3221  * If userspace requested an interrupt window, check that the
3222  * interrupt window is open.
3223  *
3224  * No need to exit to userspace if we already have an interrupt queued.
3225  */
3226 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
3227 {
3228         return kvm_arch_interrupt_allowed(vcpu) &&
3229                 !kvm_cpu_has_interrupt(vcpu) &&
3230                 !kvm_event_needs_reinjection(vcpu) &&
3231                 kvm_cpu_accept_dm_intr(vcpu);
3232 }
3233
3234 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
3235                                     struct kvm_interrupt *irq)
3236 {
3237         if (irq->irq >= KVM_NR_INTERRUPTS)
3238                 return -EINVAL;
3239
3240         if (!irqchip_in_kernel(vcpu->kvm)) {
3241                 kvm_queue_interrupt(vcpu, irq->irq, false);
3242                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3243                 return 0;
3244         }
3245
3246         /*
3247          * With in-kernel LAPIC, we only use this to inject EXTINT, so
3248          * fail for in-kernel 8259.
3249          */
3250         if (pic_in_kernel(vcpu->kvm))
3251                 return -ENXIO;
3252
3253         if (vcpu->arch.pending_external_vector != -1)
3254                 return -EEXIST;
3255
3256         vcpu->arch.pending_external_vector = irq->irq;
3257         kvm_make_request(KVM_REQ_EVENT, vcpu);
3258         return 0;
3259 }
3260
3261 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3262 {
3263         kvm_inject_nmi(vcpu);
3264
3265         return 0;
3266 }
3267
3268 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
3269 {
3270         kvm_make_request(KVM_REQ_SMI, vcpu);
3271
3272         return 0;
3273 }
3274
3275 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3276                                            struct kvm_tpr_access_ctl *tac)
3277 {
3278         if (tac->flags)
3279                 return -EINVAL;
3280         vcpu->arch.tpr_access_reporting = !!tac->enabled;
3281         return 0;
3282 }
3283
3284 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
3285                                         u64 mcg_cap)
3286 {
3287         int r;
3288         unsigned bank_num = mcg_cap & 0xff, bank;
3289
3290         r = -EINVAL;
3291         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
3292                 goto out;
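             /*
              * Bits 7:0 of MCG_CAP encode the bank count and bits 23:16 the
              * extended-register count; all other bits must be capabilities
              * KVM actually supports.
              */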
3293         if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
3294                 goto out;
3295         r = 0;
3296         vcpu->arch.mcg_cap = mcg_cap;
3297         /* Init IA32_MCG_CTL to all 1s */
3298         if (mcg_cap & MCG_CTL_P)
3299                 vcpu->arch.mcg_ctl = ~(u64)0;
3300         /* Init IA32_MCi_CTL to all 1s */
3301         for (bank = 0; bank < bank_num; bank++)
3302                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
3303
3304         if (kvm_x86_ops->setup_mce)
3305                 kvm_x86_ops->setup_mce(vcpu);
3306 out:
3307         return r;
3308 }
3309
3310 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
3311                                       struct kvm_x86_mce *mce)
3312 {
3313         u64 mcg_cap = vcpu->arch.mcg_cap;
3314         unsigned bank_num = mcg_cap & 0xff;
3315         u64 *banks = vcpu->arch.mce_banks;
3316
3317         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3318                 return -EINVAL;
3319         /*
3320          * If IA32_MCG_CTL is not all 1s, uncorrected error
3321          * reporting is disabled.
3322          */
3323         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3324             vcpu->arch.mcg_ctl != ~(u64)0)
3325                 return 0;
3326         banks += 4 * mce->bank;
3327         /*
3328          * If IA32_MCi_CTL is not all 1s, uncorrected error
3329          * reporting is disabled for this bank.
3330          */
3331         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3332                 return 0;
3333         if (mce->status & MCI_STATUS_UC) {
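                     /*
                      * A machine check while MCG_STATUS.MCIP is already set,
                      * or while CR4.MCE is clear, results in shutdown; model
                      * that as a triple fault.
                      */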
3334                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
3335                     !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
3336                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3337                         return 0;
3338                 }
3339                 if (banks[1] & MCI_STATUS_VAL)
3340                         mce->status |= MCI_STATUS_OVER;
3341                 banks[2] = mce->addr;
3342                 banks[3] = mce->misc;
3343                 vcpu->arch.mcg_status = mce->mcg_status;
3344                 banks[1] = mce->status;
3345                 kvm_queue_exception(vcpu, MC_VECTOR);
3346         } else if (!(banks[1] & MCI_STATUS_VAL) ||
3347                    !(banks[1] & MCI_STATUS_UC)) {
3348                 if (banks[1] & MCI_STATUS_VAL)
3349                         mce->status |= MCI_STATUS_OVER;
3350                 banks[2] = mce->addr;
3351                 banks[3] = mce->misc;
3352                 banks[1] = mce->status;
3353         } else
3354                 banks[1] |= MCI_STATUS_OVER;
3355         return 0;
3356 }
3357
3358 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3359                                                struct kvm_vcpu_events *events)
3360 {
3361         process_nmi(vcpu);
3362         /*
3363          * FIXME: pass injected and pending separately.  This is only
3364          * needed for nested virtualization, whose state cannot be
3365          * migrated yet.  For now we can combine them.
3366          */
3367         events->exception.injected =
3368                 (vcpu->arch.exception.pending ||
3369                  vcpu->arch.exception.injected) &&
3370                 !kvm_exception_is_soft(vcpu->arch.exception.nr);
3371         events->exception.nr = vcpu->arch.exception.nr;
3372         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3373         events->exception.pad = 0;
3374         events->exception.error_code = vcpu->arch.exception.error_code;
3375
3376         events->interrupt.injected =
3377                 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3378         events->interrupt.nr = vcpu->arch.interrupt.nr;
3379         events->interrupt.soft = 0;
3380         events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
3381
3382         events->nmi.injected = vcpu->arch.nmi_injected;
3383         events->nmi.pending = vcpu->arch.nmi_pending != 0;
3384         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
3385         events->nmi.pad = 0;
3386
3387         events->sipi_vector = 0; /* never valid when reporting to user space */
3388
3389         events->smi.smm = is_smm(vcpu);
3390         events->smi.pending = vcpu->arch.smi_pending;
3391         events->smi.smm_inside_nmi =
3392                 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3393         events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3394
3395         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3396                          | KVM_VCPUEVENT_VALID_SHADOW
3397                          | KVM_VCPUEVENT_VALID_SMM);
3398         memset(&events->reserved, 0, sizeof(events->reserved));
3399 }
3400
3401 static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
3402
3403 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3404                                               struct kvm_vcpu_events *events)
3405 {
3406         if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3407                               | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3408                               | KVM_VCPUEVENT_VALID_SHADOW
3409                               | KVM_VCPUEVENT_VALID_SMM))
3410                 return -EINVAL;
3411
3412         if (events->exception.injected &&
3413             (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3414              is_guest_mode(vcpu)))
3415                 return -EINVAL;
3416
3417         /* INITs are latched while in SMM */
3418         if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
3419             (events->smi.smm || events->smi.pending) &&
3420             vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3421                 return -EINVAL;
3422
3423         process_nmi(vcpu);
3424         vcpu->arch.exception.injected = false;
3425         vcpu->arch.exception.pending = events->exception.injected;
3426         vcpu->arch.exception.nr = events->exception.nr;
3427         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3428         vcpu->arch.exception.error_code = events->exception.error_code;
3429
3430         vcpu->arch.interrupt.injected = events->interrupt.injected;
3431         vcpu->arch.interrupt.nr = events->interrupt.nr;
3432         vcpu->arch.interrupt.soft = events->interrupt.soft;
3433         if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3434                 kvm_x86_ops->set_interrupt_shadow(vcpu,
3435                                                   events->interrupt.shadow);
3436
3437         vcpu->arch.nmi_injected = events->nmi.injected;
3438         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3439                 vcpu->arch.nmi_pending = events->nmi.pending;
3440         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3441
3442         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3443             lapic_in_kernel(vcpu))
3444                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3445
3446         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3447                 u32 hflags = vcpu->arch.hflags;
3448                 if (events->smi.smm)
3449                         hflags |= HF_SMM_MASK;
3450                 else
3451                         hflags &= ~HF_SMM_MASK;
3452                 kvm_set_hflags(vcpu, hflags);
3453
3454                 vcpu->arch.smi_pending = events->smi.pending;
3455
3456                 if (events->smi.smm) {
3457                         if (events->smi.smm_inside_nmi)
3458                                 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
3459                         else
3460                                 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3461                         if (lapic_in_kernel(vcpu)) {
3462                                 if (events->smi.latched_init)
3463                                         set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3464                                 else
3465                                         clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3466                         }
3467                 }
3468         }
3469
3470         kvm_make_request(KVM_REQ_EVENT, vcpu);
3471
3472         return 0;
3473 }
3474
3475 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3476                                              struct kvm_debugregs *dbgregs)
3477 {
3478         unsigned long val;
3479
3480         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3481         kvm_get_dr(vcpu, 6, &val);
3482         dbgregs->dr6 = val;
3483         dbgregs->dr7 = vcpu->arch.dr7;
3484         dbgregs->flags = 0;
3485         memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3486 }
3487
3488 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3489                                             struct kvm_debugregs *dbgregs)
3490 {
3491         if (dbgregs->flags)
3492                 return -EINVAL;
3493
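             /* The upper 32 bits of DR6 and DR7 are reserved and must be zero. */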
3494         if (dbgregs->dr6 & ~0xffffffffull)
3495                 return -EINVAL;
3496         if (dbgregs->dr7 & ~0xffffffffull)
3497                 return -EINVAL;
3498
3499         memcpy(vc