kvm: fix typo in flag name
[muen/linux.git] arch/x86/kvm/x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Amit Shah    <amit.shah@qumranet.com>
15  *   Ben-Ami Yassour <benami@il.ibm.com>
16  *
17  * This work is licensed under the terms of the GNU GPL, version 2.  See
18  * the COPYING file in the top-level directory.
19  *
20  */
21
22 #include <linux/kvm_host.h>
23 #include "irq.h"
24 #include "mmu.h"
25 #include "i8254.h"
26 #include "tss.h"
27 #include "kvm_cache_regs.h"
28 #include "x86.h"
29 #include "cpuid.h"
30 #include "pmu.h"
31 #include "hyperv.h"
32
33 #include <linux/clocksource.h>
34 #include <linux/interrupt.h>
35 #include <linux/kvm.h>
36 #include <linux/fs.h>
37 #include <linux/vmalloc.h>
38 #include <linux/export.h>
39 #include <linux/moduleparam.h>
40 #include <linux/mman.h>
41 #include <linux/highmem.h>
42 #include <linux/iommu.h>
43 #include <linux/intel-iommu.h>
44 #include <linux/cpufreq.h>
45 #include <linux/user-return-notifier.h>
46 #include <linux/srcu.h>
47 #include <linux/slab.h>
48 #include <linux/perf_event.h>
49 #include <linux/uaccess.h>
50 #include <linux/hash.h>
51 #include <linux/pci.h>
52 #include <linux/timekeeper_internal.h>
53 #include <linux/pvclock_gtod.h>
54 #include <linux/kvm_irqfd.h>
55 #include <linux/irqbypass.h>
56 #include <linux/sched/stat.h>
57 #include <linux/mem_encrypt.h>
58
59 #include <trace/events/kvm.h>
60
61 #include <asm/debugreg.h>
62 #include <asm/msr.h>
63 #include <asm/desc.h>
64 #include <asm/mce.h>
65 #include <linux/kernel_stat.h>
66 #include <asm/fpu/internal.h> /* Ugh! */
67 #include <asm/pvclock.h>
68 #include <asm/div64.h>
69 #include <asm/irq_remapping.h>
70 #include <asm/mshyperv.h>
71 #include <asm/hypervisor.h>
72
73 #define CREATE_TRACE_POINTS
74 #include "trace.h"
75
76 #define MAX_IO_MSRS 256
77 #define KVM_MAX_MCE_BANKS 32
78 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
79 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
80
81 #define emul_to_vcpu(ctxt) \
82         container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
83
84 /* EFER defaults:
85  * - enable SYSCALL by default because it is emulated by KVM
86  * - enable LME and LMA by default on 64-bit KVM
87  */
88 #ifdef CONFIG_X86_64
89 static
90 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
91 #else
92 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
93 #endif
94
95 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
96 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
97
98 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
99                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
100
101 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
102 static void process_nmi(struct kvm_vcpu *vcpu);
103 static void enter_smm(struct kvm_vcpu *vcpu);
104 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
105 static void store_regs(struct kvm_vcpu *vcpu);
106 static int sync_regs(struct kvm_vcpu *vcpu);
107
108 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
109 EXPORT_SYMBOL_GPL(kvm_x86_ops);
110
111 static bool __read_mostly ignore_msrs = false;
112 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
113
114 static bool __read_mostly report_ignored_msrs = true;
115 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
116
117 unsigned int min_timer_period_us = 200;
118 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
119
120 static bool __read_mostly kvmclock_periodic_sync = true;
121 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
122
123 bool __read_mostly kvm_has_tsc_control;
124 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
125 u32  __read_mostly kvm_max_guest_tsc_khz;
126 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
127 u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
128 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
129 u64  __read_mostly kvm_max_tsc_scaling_ratio;
130 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
131 u64 __read_mostly kvm_default_tsc_scaling_ratio;
132 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
133
134 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
135 static u32 __read_mostly tsc_tolerance_ppm = 250;
136 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
137
138 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
139 unsigned int __read_mostly lapic_timer_advance_ns = 0;
140 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
141 EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
142
143 static bool __read_mostly vector_hashing = true;
144 module_param(vector_hashing, bool, S_IRUGO);
145
146 bool __read_mostly enable_vmware_backdoor = false;
147 module_param(enable_vmware_backdoor, bool, S_IRUGO);
148 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
149
150 static bool __read_mostly force_emulation_prefix = false;
151 module_param(force_emulation_prefix, bool, S_IRUGO);
152
153 #define KVM_NR_SHARED_MSRS 16
154
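/*
 * Bookkeeping for guest MSRs whose host values are restored lazily, only
 * when the CPU returns to userspace, via the user-return notifier below.
 * shared_msrs_global holds the list of MSR numbers (registered with
 * kvm_define_shared_msr()); the per-cpu shared_msrs tracks the host value
 * and the currently loaded value of each of them.
 */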
155 struct kvm_shared_msrs_global {
156         int nr;
157         u32 msrs[KVM_NR_SHARED_MSRS];
158 };
159
160 struct kvm_shared_msrs {
161         struct user_return_notifier urn;
162         bool registered;
163         struct kvm_shared_msr_values {
164                 u64 host;
165                 u64 curr;
166         } values[KVM_NR_SHARED_MSRS];
167 };
168
169 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
170 static struct kvm_shared_msrs __percpu *shared_msrs;
171
172 struct kvm_stats_debugfs_item debugfs_entries[] = {
173         { "pf_fixed", VCPU_STAT(pf_fixed) },
174         { "pf_guest", VCPU_STAT(pf_guest) },
175         { "tlb_flush", VCPU_STAT(tlb_flush) },
176         { "invlpg", VCPU_STAT(invlpg) },
177         { "exits", VCPU_STAT(exits) },
178         { "io_exits", VCPU_STAT(io_exits) },
179         { "mmio_exits", VCPU_STAT(mmio_exits) },
180         { "signal_exits", VCPU_STAT(signal_exits) },
181         { "irq_window", VCPU_STAT(irq_window_exits) },
182         { "nmi_window", VCPU_STAT(nmi_window_exits) },
183         { "halt_exits", VCPU_STAT(halt_exits) },
184         { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
185         { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
186         { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
187         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
188         { "hypercalls", VCPU_STAT(hypercalls) },
189         { "request_irq", VCPU_STAT(request_irq_exits) },
190         { "irq_exits", VCPU_STAT(irq_exits) },
191         { "host_state_reload", VCPU_STAT(host_state_reload) },
192         { "fpu_reload", VCPU_STAT(fpu_reload) },
193         { "insn_emulation", VCPU_STAT(insn_emulation) },
194         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
195         { "irq_injections", VCPU_STAT(irq_injections) },
196         { "nmi_injections", VCPU_STAT(nmi_injections) },
197         { "req_event", VCPU_STAT(req_event) },
198         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
199         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
200         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
201         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
202         { "mmu_flooded", VM_STAT(mmu_flooded) },
203         { "mmu_recycled", VM_STAT(mmu_recycled) },
204         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
205         { "mmu_unsync", VM_STAT(mmu_unsync) },
206         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
207         { "largepages", VM_STAT(lpages) },
208         { "max_mmu_page_hash_collisions",
209                 VM_STAT(max_mmu_page_hash_collisions) },
210         { NULL }
211 };
212
213 u64 __read_mostly host_xcr0;
214
215 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
216
217 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
218 {
219         int i;
220         for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
221                 vcpu->arch.apf.gfns[i] = ~0;
222 }
223
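/*
 * Restore the host value of every shared MSR that differs from the value
 * currently loaded.  This runs from the user-return notifier when the CPU
 * goes back to userspace, and from drop_user_return_notifiers() on hardware
 * disable, which is why the unregistration below is done with interrupts
 * disabled.
 */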
224 static void kvm_on_user_return(struct user_return_notifier *urn)
225 {
226         unsigned slot;
227         struct kvm_shared_msrs *locals
228                 = container_of(urn, struct kvm_shared_msrs, urn);
229         struct kvm_shared_msr_values *values;
230         unsigned long flags;
231
232         /*
233          * Disabling irqs at this point since the following code could be
234          * interrupted and executed through kvm_arch_hardware_disable()
235          */
236         local_irq_save(flags);
237         if (locals->registered) {
238                 locals->registered = false;
239                 user_return_notifier_unregister(urn);
240         }
241         local_irq_restore(flags);
242         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
243                 values = &locals->values[slot];
244                 if (values->host != values->curr) {
245                         wrmsrl(shared_msrs_global.msrs[slot], values->host);
246                         values->curr = values->host;
247                 }
248         }
249 }
250
251 static void shared_msr_update(unsigned slot, u32 msr)
252 {
253         u64 value;
254         unsigned int cpu = smp_processor_id();
255         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
256
257         /* This is only read, and nobody should modify it at this time,
258          * so no lock is needed. */
259         if (slot >= shared_msrs_global.nr) {
260                 printk(KERN_ERR "kvm: invalid MSR slot!\n");
261                 return;
262         }
263         rdmsrl_safe(msr, &value);
264         smsr->values[slot].host = value;
265         smsr->values[slot].curr = value;
266 }
267
268 void kvm_define_shared_msr(unsigned slot, u32 msr)
269 {
270         BUG_ON(slot >= KVM_NR_SHARED_MSRS);
271         shared_msrs_global.msrs[slot] = msr;
272         if (slot >= shared_msrs_global.nr)
273                 shared_msrs_global.nr = slot + 1;
274 }
275 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
276
277 static void kvm_shared_msr_cpu_online(void)
278 {
279         unsigned i;
280
281         for (i = 0; i < shared_msrs_global.nr; ++i)
282                 shared_msr_update(i, shared_msrs_global.msrs[i]);
283 }
284
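/*
 * Update the shared MSR in @slot to @value, skipping the WRMSR when none of
 * the bits selected by @mask have changed.  The first update on a CPU
 * registers the user-return notifier so that kvm_on_user_return() can
 * restore the host value later.  Returns 0 on success and 1 if the WRMSR
 * faults.
 */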
285 int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
286 {
287         unsigned int cpu = smp_processor_id();
288         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
289         int err;
290
291         if (((value ^ smsr->values[slot].curr) & mask) == 0)
292                 return 0;
293         smsr->values[slot].curr = value;
294         err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
295         if (err)
296                 return 1;
297
298         if (!smsr->registered) {
299                 smsr->urn.on_user_return = kvm_on_user_return;
300                 user_return_notifier_register(&smsr->urn);
301                 smsr->registered = true;
302         }
303         return 0;
304 }
305 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
306
307 static void drop_user_return_notifiers(void)
308 {
309         unsigned int cpu = smp_processor_id();
310         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
311
312         if (smsr->registered)
313                 kvm_on_user_return(&smsr->urn);
314 }
315
316 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
317 {
318         return vcpu->arch.apic_base;
319 }
320 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
321
322 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
323 {
324         return kvm_apic_mode(kvm_get_apic_base(vcpu));
325 }
326 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
327
328 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
329 {
330         enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
331         enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
332         u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
333                 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
334
335         if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
336                 return 1;
337         if (!msr_info->host_initiated) {
338                 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
339                         return 1;
340                 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
341                         return 1;
342         }
343
344         kvm_lapic_set_base(vcpu, msr_info->data);
345         return 0;
346 }
347 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
348
349 asmlinkage __visible void kvm_spurious_fault(void)
350 {
351         /* Fault while not rebooting.  We want the trace. */
352         BUG();
353 }
354 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
355
356 #define EXCPT_BENIGN            0
357 #define EXCPT_CONTRIBUTORY      1
358 #define EXCPT_PF                2
359
360 static int exception_class(int vector)
361 {
362         switch (vector) {
363         case PF_VECTOR:
364                 return EXCPT_PF;
365         case DE_VECTOR:
366         case TS_VECTOR:
367         case NP_VECTOR:
368         case SS_VECTOR:
369         case GP_VECTOR:
370                 return EXCPT_CONTRIBUTORY;
371         default:
372                 break;
373         }
374         return EXCPT_BENIGN;
375 }
376
377 #define EXCPT_FAULT             0
378 #define EXCPT_TRAP              1
379 #define EXCPT_ABORT             2
380 #define EXCPT_INTERRUPT         3
381
382 static int exception_type(int vector)
383 {
384         unsigned int mask;
385
386         if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
387                 return EXCPT_INTERRUPT;
388
389         mask = 1 << vector;
390
391         /* #DB is a trap, as instruction watchpoints are handled elsewhere */
392         if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
393                 return EXCPT_TRAP;
394
395         if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
396                 return EXCPT_ABORT;
397
398         /* Reserved exceptions will result in a fault */
399         return EXCPT_FAULT;
400 }
401
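/*
 * Core exception queueing logic.  If nothing is pending or injected, the new
 * exception is simply recorded (dropping the error code outside protected
 * mode, and marking it injected rather than pending when reinjecting).
 * Otherwise the two exceptions are merged following SDM Table 5-5: a
 * previous #DF escalates to a triple-fault shutdown, two contributory
 * exceptions or a #PF followed by a non-benign exception become #DF, and in
 * all other cases the new exception replaces the old one.
 */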
402 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
403                 unsigned nr, bool has_error, u32 error_code,
404                 bool reinject)
405 {
406         u32 prev_nr;
407         int class1, class2;
408
409         kvm_make_request(KVM_REQ_EVENT, vcpu);
410
411         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
412         queue:
413                 if (has_error && !is_protmode(vcpu))
414                         has_error = false;
415                 if (reinject) {
416                         /*
417                          * On vmentry, vcpu->arch.exception.pending is only
418                          * true if an event injection was blocked by
419                          * nested_run_pending.  In that case, however,
420                          * vcpu_enter_guest requests an immediate exit,
421                          * and the guest shouldn't proceed far enough to
422                          * need reinjection.
423                          */
424                         WARN_ON_ONCE(vcpu->arch.exception.pending);
425                         vcpu->arch.exception.injected = true;
426                 } else {
427                         vcpu->arch.exception.pending = true;
428                         vcpu->arch.exception.injected = false;
429                 }
430                 vcpu->arch.exception.has_error_code = has_error;
431                 vcpu->arch.exception.nr = nr;
432                 vcpu->arch.exception.error_code = error_code;
433                 return;
434         }
435
436         /* Check how the new exception interacts with the one already queued. */
437         prev_nr = vcpu->arch.exception.nr;
438         if (prev_nr == DF_VECTOR) {
439                 /* triple fault -> shutdown */
440                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
441                 return;
442         }
443         class1 = exception_class(prev_nr);
444         class2 = exception_class(nr);
445         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
446                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
447                 /*
448                  * Generate double fault per SDM Table 5-5.  Set
449                  * exception.pending = true so that the double fault
450                  * can trigger a nested vmexit.
451                  */
452                 vcpu->arch.exception.pending = true;
453                 vcpu->arch.exception.injected = false;
454                 vcpu->arch.exception.has_error_code = true;
455                 vcpu->arch.exception.nr = DF_VECTOR;
456                 vcpu->arch.exception.error_code = 0;
457         } else
458                 /* replace previous exception with a new one in the hope
459                    that instruction re-execution will regenerate the lost
460                    exception */
461                 goto queue;
462 }
463
464 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
465 {
466         kvm_multiple_exception(vcpu, nr, false, 0, false);
467 }
468 EXPORT_SYMBOL_GPL(kvm_queue_exception);
469
470 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
471 {
472         kvm_multiple_exception(vcpu, nr, false, 0, true);
473 }
474 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
475
476 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
477 {
478         if (err)
479                 kvm_inject_gp(vcpu, 0);
480         else
481                 return kvm_skip_emulated_instruction(vcpu);
482
483         return 1;
484 }
485 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
486
487 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
488 {
489         ++vcpu->stat.pf_guest;
490         vcpu->arch.exception.nested_apf =
491                 is_guest_mode(vcpu) && fault->async_page_fault;
492         if (vcpu->arch.exception.nested_apf)
493                 vcpu->arch.apf.nested_apf_token = fault->address;
494         else
495                 vcpu->arch.cr2 = fault->address;
496         kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
497 }
498 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
499
500 static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
501 {
502         if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
503                 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
504         else
505                 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
506
507         return fault->nested_page_fault;
508 }
509
510 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
511 {
512         atomic_inc(&vcpu->arch.nmi_queued);
513         kvm_make_request(KVM_REQ_NMI, vcpu);
514 }
515 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
516
517 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
518 {
519         kvm_multiple_exception(vcpu, nr, true, error_code, false);
520 }
521 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
522
523 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
524 {
525         kvm_multiple_exception(vcpu, nr, true, error_code, true);
526 }
527 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
528
529 /*
530  * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
531  * a #GP and return false.
532  */
533 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
534 {
535         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
536                 return true;
537         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
538         return false;
539 }
540 EXPORT_SYMBOL_GPL(kvm_require_cpl);
541
542 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
543 {
544         if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
545                 return true;
546
547         kvm_queue_exception(vcpu, UD_VECTOR);
548         return false;
549 }
550 EXPORT_SYMBOL_GPL(kvm_require_dr);
551
552 /*
553  * This function is used to read from the physical memory of the currently
554  * running guest. Unlike kvm_vcpu_read_guest_page, it can read either from
555  * guest physical memory or from the guest's nested-guest physical memory.
556  */
557 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
558                             gfn_t ngfn, void *data, int offset, int len,
559                             u32 access)
560 {
561         struct x86_exception exception;
562         gfn_t real_gfn;
563         gpa_t ngpa;
564
565         ngpa     = gfn_to_gpa(ngfn);
566         real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
567         if (real_gfn == UNMAPPED_GVA)
568                 return -EFAULT;
569
570         real_gfn = gpa_to_gfn(real_gfn);
571
572         return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
573 }
574 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
575
576 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
577                                void *data, int offset, int len, u32 access)
578 {
579         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
580                                        data, offset, len, access);
581 }
582
583 /*
584  * Load the PAE PDPTRs.  Return true if they are all valid.
585  */
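/*
 * In PAE mode, CR3 bits 31:5 hold the 32-byte-aligned address of the
 * four-entry page-directory-pointer table.  The offset computed below,
 * ((cr3 & (PAGE_SIZE-1)) >> 5) << 2, is the table's position within its
 * page in units of 8-byte entries, hence the offset * sizeof(u64) byte
 * offset passed to kvm_read_guest_page_mmu().
 */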
586 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
587 {
588         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
589         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
590         int i;
591         int ret;
592         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
593
594         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
595                                       offset * sizeof(u64), sizeof(pdpte),
596                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
597         if (ret < 0) {
598                 ret = 0;
599                 goto out;
600         }
601         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
602                 if ((pdpte[i] & PT_PRESENT_MASK) &&
603                     (pdpte[i] &
604                      vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
605                         ret = 0;
606                         goto out;
607                 }
608         }
609         ret = 1;
610
611         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
612         __set_bit(VCPU_EXREG_PDPTR,
613                   (unsigned long *)&vcpu->arch.regs_avail);
614         __set_bit(VCPU_EXREG_PDPTR,
615                   (unsigned long *)&vcpu->arch.regs_dirty);
616 out:
617
618         return ret;
619 }
620 EXPORT_SYMBOL_GPL(load_pdptrs);
621
622 bool pdptrs_changed(struct kvm_vcpu *vcpu)
623 {
624         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
625         bool changed = true;
626         int offset;
627         gfn_t gfn;
628         int r;
629
630         if (is_long_mode(vcpu) || !is_pae(vcpu))
631                 return false;
632
633         if (!test_bit(VCPU_EXREG_PDPTR,
634                       (unsigned long *)&vcpu->arch.regs_avail))
635                 return true;
636
637         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
638         offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
639         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
640                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
641         if (r < 0)
642                 goto out;
643         changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
644 out:
645
646         return changed;
647 }
648 EXPORT_SYMBOL_GPL(pdptrs_changed);
649
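/*
 * Emulate a guest write to CR0.  The checks below reject reserved or
 * inconsistent settings (NW without CD, PG without PE, upper bits set on
 * 64-bit, entering paged long mode without PAE or with a 64-bit CS) and
 * reload the PAE PDPTRs when needed.  Toggling CR0.PG or CR0.WP forces an
 * MMU context reset, and toggling CR0.CD may require zapping the guest's
 * mappings when non-coherent DMA is in use.
 */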
650 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
651 {
652         unsigned long old_cr0 = kvm_read_cr0(vcpu);
653         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
654
655         cr0 |= X86_CR0_ET;
656
657 #ifdef CONFIG_X86_64
658         if (cr0 & 0xffffffff00000000UL)
659                 return 1;
660 #endif
661
662         cr0 &= ~CR0_RESERVED_BITS;
663
664         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
665                 return 1;
666
667         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
668                 return 1;
669
670         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
671 #ifdef CONFIG_X86_64
672                 if ((vcpu->arch.efer & EFER_LME)) {
673                         int cs_db, cs_l;
674
675                         if (!is_pae(vcpu))
676                                 return 1;
677                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
678                         if (cs_l)
679                                 return 1;
680                 } else
681 #endif
682                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
683                                                  kvm_read_cr3(vcpu)))
684                         return 1;
685         }
686
687         if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
688                 return 1;
689
690         kvm_x86_ops->set_cr0(vcpu, cr0);
691
692         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
693                 kvm_clear_async_pf_completion_queue(vcpu);
694                 kvm_async_pf_hash_reset(vcpu);
695         }
696
697         if ((cr0 ^ old_cr0) & update_bits)
698                 kvm_mmu_reset_context(vcpu);
699
700         if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
701             kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
702             !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
703                 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
704
705         return 0;
706 }
707 EXPORT_SYMBOL_GPL(kvm_set_cr0);
708
709 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
710 {
711         (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
712 }
713 EXPORT_SYMBOL_GPL(kvm_lmsw);
714
715 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
716 {
717         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
718                         !vcpu->guest_xcr0_loaded) {
719                 /* kvm_set_xcr() also depends on this */
720                 if (vcpu->arch.xcr0 != host_xcr0)
721                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
722                 vcpu->guest_xcr0_loaded = 1;
723         }
724 }
725
726 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
727 {
728         if (vcpu->guest_xcr0_loaded) {
729                 if (vcpu->arch.xcr0 != host_xcr0)
730                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
731                 vcpu->guest_xcr0_loaded = 0;
732         }
733 }
734
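/*
 * Validate and apply a guest XCR0 value: x87 state must always be enabled,
 * AVX (YMM) requires SSE, the two MPX bits must be set or cleared together,
 * the three AVX-512 bits must be set together and require YMM, and only
 * features the host saves for this guest are allowed.  Changing any of the
 * extended-state bits re-evaluates the guest's CPUID-derived state.
 */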
735 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
736 {
737         u64 xcr0 = xcr;
738         u64 old_xcr0 = vcpu->arch.xcr0;
739         u64 valid_bits;
740
741         /* Only XCR_XFEATURE_ENABLED_MASK (xcr0) is supported for now */
742         if (index != XCR_XFEATURE_ENABLED_MASK)
743                 return 1;
744         if (!(xcr0 & XFEATURE_MASK_FP))
745                 return 1;
746         if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
747                 return 1;
748
749         /*
750          * Do not allow the guest to set bits that we do not support
751          * saving.  However, xcr0 bit 0 is always set, even if the
752          * emulated CPU does not support XSAVE (see fx_init).
753          */
754         valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
755         if (xcr0 & ~valid_bits)
756                 return 1;
757
758         if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
759             (!(xcr0 & XFEATURE_MASK_BNDCSR)))
760                 return 1;
761
762         if (xcr0 & XFEATURE_MASK_AVX512) {
763                 if (!(xcr0 & XFEATURE_MASK_YMM))
764                         return 1;
765                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
766                         return 1;
767         }
768         vcpu->arch.xcr0 = xcr0;
769
770         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
771                 kvm_update_cpuid(vcpu);
772         return 0;
773 }
774
775 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
776 {
777         if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
778             __kvm_set_xcr(vcpu, index, xcr)) {
779                 kvm_inject_gp(vcpu, 0);
780                 return 1;
781         }
782         return 0;
783 }
784 EXPORT_SYMBOL_GPL(kvm_set_xcr);
785
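/*
 * Emulate a guest write to CR4.  Feature bits such as OSXSAVE, SMEP, SMAP,
 * FSGSBASE, PKE, LA57 and UMIP may only be set when the corresponding CPUID
 * feature is exposed to the guest; enabling PCIDE additionally requires long
 * mode and CR3[11:0] == 0.  Changes to the paging-related bits reload the
 * PAE PDPTRs or reset the MMU context, and changes to OSXSAVE/PKE
 * re-evaluate the guest's CPUID-derived state.
 */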
786 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
787 {
788         unsigned long old_cr4 = kvm_read_cr4(vcpu);
789         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
790                                    X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
791
792         if (cr4 & CR4_RESERVED_BITS)
793                 return 1;
794
795         if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
796                 return 1;
797
798         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
799                 return 1;
800
801         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
802                 return 1;
803
804         if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
805                 return 1;
806
807         if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
808                 return 1;
809
810         if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
811                 return 1;
812
813         if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
814                 return 1;
815
816         if (is_long_mode(vcpu)) {
817                 if (!(cr4 & X86_CR4_PAE))
818                         return 1;
819         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
820                    && ((cr4 ^ old_cr4) & pdptr_bits)
821                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
822                                    kvm_read_cr3(vcpu)))
823                 return 1;
824
825         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
826                 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
827                         return 1;
828
829                 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
830                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
831                         return 1;
832         }
833
834         if (kvm_x86_ops->set_cr4(vcpu, cr4))
835                 return 1;
836
837         if (((cr4 ^ old_cr4) & pdptr_bits) ||
838             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
839                 kvm_mmu_reset_context(vcpu);
840
841         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
842                 kvm_update_cpuid(vcpu);
843
844         return 0;
845 }
846 EXPORT_SYMBOL_GPL(kvm_set_cr4);
847
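/*
 * Emulate a guest write to CR3.  If the value (with the PCID no-flush bit
 * masked off when PCIDs are enabled) is unchanged and the PDPTRs did not
 * change, only the shadow roots are synced and a TLB flush is requested.
 * Otherwise the new value is checked against the guest's physical-address
 * width in long mode, or the PAE PDPTRs are reloaded, before the MMU is
 * switched to the new root.
 */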
848 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
849 {
850 #ifdef CONFIG_X86_64
851         bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
852
853         if (pcid_enabled)
854                 cr3 &= ~CR3_PCID_INVD;
855 #endif
856
857         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
858                 kvm_mmu_sync_roots(vcpu);
859                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
860                 return 0;
861         }
862
863         if (is_long_mode(vcpu) &&
864             (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
865                 return 1;
866         else if (is_pae(vcpu) && is_paging(vcpu) &&
867                    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
868                 return 1;
869
870         vcpu->arch.cr3 = cr3;
871         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
872         kvm_mmu_new_cr3(vcpu);
873         return 0;
874 }
875 EXPORT_SYMBOL_GPL(kvm_set_cr3);
876
877 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
878 {
879         if (cr8 & CR8_RESERVED_BITS)
880                 return 1;
881         if (lapic_in_kernel(vcpu))
882                 kvm_lapic_set_tpr(vcpu, cr8);
883         else
884                 vcpu->arch.cr8 = cr8;
885         return 0;
886 }
887 EXPORT_SYMBOL_GPL(kvm_set_cr8);
888
889 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
890 {
891         if (lapic_in_kernel(vcpu))
892                 return kvm_lapic_get_cr8(vcpu);
893         else
894                 return vcpu->arch.cr8;
895 }
896 EXPORT_SYMBOL_GPL(kvm_get_cr8);
897
898 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
899 {
900         int i;
901
902         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
903                 for (i = 0; i < KVM_NR_DB_REGS; i++)
904                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
905                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
906         }
907 }
908
909 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
910 {
911         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
912                 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
913 }
914
915 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
916 {
917         unsigned long dr7;
918
919         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
920                 dr7 = vcpu->arch.guest_debug_dr7;
921         else
922                 dr7 = vcpu->arch.dr7;
923         kvm_x86_ops->set_dr7(vcpu, dr7);
924         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
925         if (dr7 & DR7_BP_EN_MASK)
926                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
927 }
928
929 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
930 {
931         u64 fixed = DR6_FIXED_1;
932
933         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
934                 fixed |= DR6_RTM;
935         return fixed;
936 }
937
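/*
 * DR4 and DR5 are handled as aliases of DR6 and DR7 here; kvm_require_dr()
 * has already injected #UD for the CR4.DE case.  Writing a value with any
 * of the upper 32 bits set returns -1, which kvm_set_dr() turns into a #GP.
 */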
938 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
939 {
940         switch (dr) {
941         case 0 ... 3:
942                 vcpu->arch.db[dr] = val;
943                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
944                         vcpu->arch.eff_db[dr] = val;
945                 break;
946         case 4:
947                 /* fall through */
948         case 6:
949                 if (val & 0xffffffff00000000ULL)
950                         return -1; /* #GP */
951                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
952                 kvm_update_dr6(vcpu);
953                 break;
954         case 5:
955                 /* fall through */
956         default: /* 7 */
957                 if (val & 0xffffffff00000000ULL)
958                         return -1; /* #GP */
959                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
960                 kvm_update_dr7(vcpu);
961                 break;
962         }
963
964         return 0;
965 }
966
967 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
968 {
969         if (__kvm_set_dr(vcpu, dr, val)) {
970                 kvm_inject_gp(vcpu, 0);
971                 return 1;
972         }
973         return 0;
974 }
975 EXPORT_SYMBOL_GPL(kvm_set_dr);
976
977 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
978 {
979         switch (dr) {
980         case 0 ... 3:
981                 *val = vcpu->arch.db[dr];
982                 break;
983         case 4:
984                 /* fall through */
985         case 6:
986                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
987                         *val = vcpu->arch.dr6;
988                 else
989                         *val = kvm_x86_ops->get_dr6(vcpu);
990                 break;
991         case 5:
992                 /* fall through */
993         default: /* 7 */
994                 *val = vcpu->arch.dr7;
995                 break;
996         }
997         return 0;
998 }
999 EXPORT_SYMBOL_GPL(kvm_get_dr);
1000
1001 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1002 {
1003         u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1004         u64 data;
1005         int err;
1006
1007         err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1008         if (err)
1009                 return err;
1010         kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1011         kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1012         return err;
1013 }
1014 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1015
1016 /*
1017  * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
1018  * KVM_SET_MSRS and KVM_GET_MSR_INDEX_LIST.
1019  *
1020  * This list is modified at module load time to reflect the
1021  * capabilities of the host CPU. The capability test skips MSRs that are
1022  * KVM-specific; those go in emulated_msrs instead, whose filtering
1023  * may depend on host virtualization features rather than host CPU features.
1024  */
1025
1026 static u32 msrs_to_save[] = {
1027         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1028         MSR_STAR,
1029 #ifdef CONFIG_X86_64
1030         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1031 #endif
1032         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1033         MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1034         MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1035 };
1036
1037 static unsigned num_msrs_to_save;
1038
1039 static u32 emulated_msrs[] = {
1040         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1041         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1042         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1043         HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1044         HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1045         HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1046         HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1047         HV_X64_MSR_RESET,
1048         HV_X64_MSR_VP_INDEX,
1049         HV_X64_MSR_VP_RUNTIME,
1050         HV_X64_MSR_SCONTROL,
1051         HV_X64_MSR_STIMER0_CONFIG,
1052         HV_X64_MSR_VP_ASSIST_PAGE,
1053         HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1054         HV_X64_MSR_TSC_EMULATION_STATUS,
1055
1056         MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1057         MSR_KVM_PV_EOI_EN,
1058
1059         MSR_IA32_TSC_ADJUST,
1060         MSR_IA32_TSCDEADLINE,
1061         MSR_IA32_MISC_ENABLE,
1062         MSR_IA32_MCG_STATUS,
1063         MSR_IA32_MCG_CTL,
1064         MSR_IA32_MCG_EXT_CTL,
1065         MSR_IA32_SMBASE,
1066         MSR_SMI_COUNT,
1067         MSR_PLATFORM_INFO,
1068         MSR_MISC_FEATURES_ENABLES,
1069 };
1070
1071 static unsigned num_emulated_msrs;
1072
1073 /*
1074  * List of MSR numbers which are used to expose MSR-based features that
1075  * can be used by a hypervisor to validate requested CPU features.
1076  */
1077 static u32 msr_based_features[] = {
1078         MSR_IA32_VMX_BASIC,
1079         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1080         MSR_IA32_VMX_PINBASED_CTLS,
1081         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1082         MSR_IA32_VMX_PROCBASED_CTLS,
1083         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1084         MSR_IA32_VMX_EXIT_CTLS,
1085         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1086         MSR_IA32_VMX_ENTRY_CTLS,
1087         MSR_IA32_VMX_MISC,
1088         MSR_IA32_VMX_CR0_FIXED0,
1089         MSR_IA32_VMX_CR0_FIXED1,
1090         MSR_IA32_VMX_CR4_FIXED0,
1091         MSR_IA32_VMX_CR4_FIXED1,
1092         MSR_IA32_VMX_VMCS_ENUM,
1093         MSR_IA32_VMX_PROCBASED_CTLS2,
1094         MSR_IA32_VMX_EPT_VPID_CAP,
1095         MSR_IA32_VMX_VMFUNC,
1096
1097         MSR_F10H_DECFG,
1098         MSR_IA32_UCODE_REV,
1099 };
1100
1101 static unsigned int num_msr_based_features;
1102
1103 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1104 {
1105         switch (msr->index) {
1106         case MSR_IA32_UCODE_REV:
1107                 rdmsrl(msr->index, msr->data);
1108                 break;
1109         default:
1110                 if (kvm_x86_ops->get_msr_feature(msr))
1111                         return 1;
1112         }
1113         return 0;
1114 }
1115
1116 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1117 {
1118         struct kvm_msr_entry msr;
1119         int r;
1120
1121         msr.index = index;
1122         r = kvm_get_msr_feature(&msr);
1123         if (r)
1124                 return r;
1125
1126         *data = msr.data;
1127
1128         return 0;
1129 }
1130
1131 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1132 {
1133         if (efer & efer_reserved_bits)
1134                 return false;
1135
1136         if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1137                         return false;
1138
1139         if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1140                         return false;
1141
1142         return true;
1143 }
1144 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1145
1146 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
1147 {
1148         u64 old_efer = vcpu->arch.efer;
1149
1150         if (!kvm_valid_efer(vcpu, efer))
1151                 return 1;
1152
1153         if (is_paging(vcpu)
1154             && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1155                 return 1;
1156
1157         efer &= ~EFER_LMA;
1158         efer |= vcpu->arch.efer & EFER_LMA;
1159
1160         kvm_x86_ops->set_efer(vcpu, efer);
1161
1162         /* Update reserved bits */
1163         if ((efer ^ old_efer) & EFER_NX)
1164                 kvm_mmu_reset_context(vcpu);
1165
1166         return 0;
1167 }
1168
1169 void kvm_enable_efer_bits(u64 mask)
1170 {
1171        efer_reserved_bits &= ~mask;
1172 }
1173 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1174
1175 /*
1176  * Write the MSR value into the appropriate "register".
1177  * Returns 0 on success, non-0 otherwise.
1178  * Assumes vcpu_load() was already called.
1179  */
1180 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1181 {
1182         switch (msr->index) {
1183         case MSR_FS_BASE:
1184         case MSR_GS_BASE:
1185         case MSR_KERNEL_GS_BASE:
1186         case MSR_CSTAR:
1187         case MSR_LSTAR:
1188                 if (is_noncanonical_address(msr->data, vcpu))
1189                         return 1;
1190                 break;
1191         case MSR_IA32_SYSENTER_EIP:
1192         case MSR_IA32_SYSENTER_ESP:
1193                 /*
1194                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1195                  * non-canonical address is written on Intel but not on
1196                  * AMD (which ignores the top 32-bits, because it does
1197                  * not implement 64-bit SYSENTER).
1198                  *
1199                  * 64-bit code should hence be able to write a non-canonical
1200                  * value on AMD.  Making the address canonical ensures that
1201                  * vmentry does not fail on Intel after writing a non-canonical
1202                  * value, and that something deterministic happens if the guest
1203                  * invokes 64-bit SYSENTER.
1204                  */
1205                 msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1206         }
1207         return kvm_x86_ops->set_msr(vcpu, msr);
1208 }
1209 EXPORT_SYMBOL_GPL(kvm_set_msr);
1210
1211 /*
1212  * Adapt kvm_get_msr() and kvm_set_msr() to msr_io()'s calling convention
1213  */
1214 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1215 {
1216         struct msr_data msr;
1217         int r;
1218
1219         msr.index = index;
1220         msr.host_initiated = true;
1221         r = kvm_get_msr(vcpu, &msr);
1222         if (r)
1223                 return r;
1224
1225         *data = msr.data;
1226         return 0;
1227 }
1228
1229 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1230 {
1231         struct msr_data msr;
1232
1233         msr.data = *data;
1234         msr.index = index;
1235         msr.host_initiated = true;
1236         return kvm_set_msr(vcpu, &msr);
1237 }
1238
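/*
 * Snapshot of the host timekeeper (clocksource parameters, boot-based
 * offset and wall time) maintained by update_pvclock_gtod() below.  The
 * masterclock code reads it, e.g. in kvm_track_tsc_matching(), to decide
 * whether the host clocksource is TSC-based.
 */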
1239 #ifdef CONFIG_X86_64
1240 struct pvclock_gtod_data {
1241         seqcount_t      seq;
1242
1243         struct { /* extract of a clocksource struct */
1244                 int vclock_mode;
1245                 u64     cycle_last;
1246                 u64     mask;
1247                 u32     mult;
1248                 u32     shift;
1249         } clock;
1250
1251         u64             boot_ns;
1252         u64             nsec_base;
1253         u64             wall_time_sec;
1254 };
1255
1256 static struct pvclock_gtod_data pvclock_gtod_data;
1257
1258 static void update_pvclock_gtod(struct timekeeper *tk)
1259 {
1260         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1261         u64 boot_ns;
1262
1263         boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
1264
1265         write_seqcount_begin(&vdata->seq);
1266
1267         /* copy pvclock gtod data */
1268         vdata->clock.vclock_mode        = tk->tkr_mono.clock->archdata.vclock_mode;
1269         vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1270         vdata->clock.mask               = tk->tkr_mono.mask;
1271         vdata->clock.mult               = tk->tkr_mono.mult;
1272         vdata->clock.shift              = tk->tkr_mono.shift;
1273
1274         vdata->boot_ns                  = boot_ns;
1275         vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
1276
1277         vdata->wall_time_sec            = tk->xtime_sec;
1278
1279         write_seqcount_end(&vdata->seq);
1280 }
1281 #endif
1282
1283 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1284 {
1285         /*
1286          * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1287          * vcpu_enter_guest.  This function is only called from
1288          * the physical CPU that is running vcpu.
1289          */
1290         kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1291 }
1292
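/*
 * The wall-clock version field acts as a seqcount shared with the guest:
 * it is bumped to an odd value before the data is updated and to an even
 * value afterwards, so a guest that reads an odd or changing version knows
 * to retry.
 */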
1293 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1294 {
1295         int version;
1296         int r;
1297         struct pvclock_wall_clock wc;
1298         struct timespec64 boot;
1299
1300         if (!wall_clock)
1301                 return;
1302
1303         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1304         if (r)
1305                 return;
1306
1307         if (version & 1)
1308                 ++version;  /* first time write, random junk */
1309
1310         ++version;
1311
1312         if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1313                 return;
1314
1315         /*
1316          * The guest calculates current wall clock time by adding
1317          * system time (updated by kvm_guest_time_update below) to the
1318          * wall clock specified here.  Guest system time equals host
1319          * system time for us, thus we must fill in the host boot time here.
1320          */
1321         getboottime64(&boot);
1322
1323         if (kvm->arch.kvmclock_offset) {
1324                 struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1325                 boot = timespec64_sub(boot, ts);
1326         }
1327         wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1328         wc.nsec = boot.tv_nsec;
1329         wc.version = version;
1330
1331         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1332
1333         version++;
1334         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1335 }
1336
1337 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1338 {
1339         do_shl32_div32(dividend, divisor);
1340         return dividend;
1341 }
1342
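/*
 * Find a (shift, multiplier) pair that approximates the ratio
 * scaled_hz / base_hz as mult * 2^(shift - 32), so that a delta counted in
 * base_hz ticks is converted to scaled_hz ticks roughly as
 * ((delta << shift) * mult) >> 32 (a negative shift meaning a right shift),
 * which matches how pvclock_scale_delta() is used by compute_guest_tsc()
 * below.
 */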
1343 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1344                                s8 *pshift, u32 *pmultiplier)
1345 {
1346         uint64_t scaled64;
1347         int32_t  shift = 0;
1348         uint64_t tps64;
1349         uint32_t tps32;
1350
1351         tps64 = base_hz;
1352         scaled64 = scaled_hz;
1353         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1354                 tps64 >>= 1;
1355                 shift--;
1356         }
1357
1358         tps32 = (uint32_t)tps64;
1359         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1360                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1361                         scaled64 >>= 1;
1362                 else
1363                         tps32 <<= 1;
1364                 shift++;
1365         }
1366
1367         *pshift = shift;
1368         *pmultiplier = div_frac(scaled64, tps32);
1369
1370         pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1371                  __func__, base_hz, scaled_hz, shift, *pmultiplier);
1372 }
1373
1374 #ifdef CONFIG_X86_64
1375 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1376 #endif
1377
1378 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1379 static unsigned long max_tsc_khz;
1380
1381 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1382 {
1383         u64 v = (u64)khz * (1000000 + ppm);
1384         do_div(v, 1000000);
1385         return v;
1386 }
1387
1388 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1389 {
1390         u64 ratio;
1391
1392         /* Guest TSC same frequency as host TSC? */
1393         if (!scale) {
1394                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1395                 return 0;
1396         }
1397
1398         /* TSC scaling supported? */
1399         if (!kvm_has_tsc_control) {
1400                 if (user_tsc_khz > tsc_khz) {
1401                         vcpu->arch.tsc_catchup = 1;
1402                         vcpu->arch.tsc_always_catchup = 1;
1403                         return 0;
1404                 } else {
1405                         WARN(1, "user requested TSC rate below hardware speed\n");
1406                         return -1;
1407                 }
1408         }
1409
1410         /* TSC scaling required  - calculate ratio */
1411         ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1412                                 user_tsc_khz, tsc_khz);
1413
1414         if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
1415                 WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1416                           user_tsc_khz);
1417                 return -1;
1418         }
1419
1420         vcpu->arch.tsc_scaling_ratio = ratio;
1421         return 0;
1422 }
1423
1424 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1425 {
1426         u32 thresh_lo, thresh_hi;
1427         int use_scaling = 0;
1428
1429         /* tsc_khz can be zero if TSC calibration fails */
1430         if (user_tsc_khz == 0) {
1431                 /* set tsc_scaling_ratio to a safe value */
1432                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1433                 return -1;
1434         }
1435
1436         /* Compute a scale to convert nanoseconds into TSC cycles */
1437         kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
1438                            &vcpu->arch.virtual_tsc_shift,
1439                            &vcpu->arch.virtual_tsc_mult);
1440         vcpu->arch.virtual_tsc_khz = user_tsc_khz;
1441
1442         /*
1443          * Compute the variation in TSC rate which is acceptable
1444          * within the range of tolerance and decide whether the
1445          * rate being applied is within those bounds of the hardware
1446          * rate.  If so, no scaling or compensation needs to be done.
1447          */
1448         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1449         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1450         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
1451                 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
1452                 use_scaling = 1;
1453         }
1454         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
1455 }
1456
1457 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1458 {
1459         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1460                                       vcpu->arch.virtual_tsc_mult,
1461                                       vcpu->arch.virtual_tsc_shift);
1462         tsc += vcpu->arch.this_tsc_write;
1463         return tsc;
1464 }
1465
1466 static inline int gtod_is_based_on_tsc(int mode)
1467 {
1468         return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
1469 }
1470
1471 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1472 {
1473 #ifdef CONFIG_X86_64
1474         bool vcpus_matched;
1475         struct kvm_arch *ka = &vcpu->kvm->arch;
1476         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1477
1478         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1479                          atomic_read(&vcpu->kvm->online_vcpus));
1480
1481         /*
1482          * Once the masterclock is enabled, always perform the request in
1483          * order to update it.
1484          *
1485          * In order to enable the masterclock, the host clocksource must be
1486          * TSC and the vcpus need to have matched TSCs.  When that happens,
1487          * perform the request to enable the masterclock.
1488          */
1489         if (ka->use_master_clock ||
1490             (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
1491                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1492
1493         trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1494                             atomic_read(&vcpu->kvm->online_vcpus),
1495                             ka->use_master_clock, gtod->clock.vclock_mode);
1496 #endif
1497 }
1498
1499 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1500 {
1501         u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1502         vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1503 }
1504
1505 /*
1506  * Multiply tsc by a fixed point number represented by ratio.
1507  *
1508  * The most significant 64-N bits (mult) of ratio represent the
1509  * integral part of the fixed point number; the remaining N bits
1510  * (frac) represent the fractional part, i.e. ratio represents a fixed
1511  * point number (mult + frac * 2^(-N)).
1512  *
1513  * N equals kvm_tsc_scaling_ratio_frac_bits.
1514  */
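/*
 * For illustration: with N = 48 fractional bits (the format used for the
 * VMX TSC multiplier), a guest-to-host frequency ratio of 1.5 would be
 * encoded as 3 << 47 = 0x1800000000000, and __scale_tsc() applies it with
 * a 128-bit multiply followed by a right shift of N bits
 * (mul_u64_u64_shr()).
 */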
1515 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1516 {
1517         return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
1518 }
1519
1520 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1521 {
1522         u64 _tsc = tsc;
1523         u64 ratio = vcpu->arch.tsc_scaling_ratio;
1524
1525         if (ratio != kvm_default_tsc_scaling_ratio)
1526                 _tsc = __scale_tsc(ratio, tsc);
1527
1528         return _tsc;
1529 }
1530 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
1531
1532 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1533 {
1534         u64 tsc;
1535
1536         tsc = kvm_scale_tsc(vcpu, rdtsc());
1537
1538         return target_tsc - tsc;
1539 }
1540
1541 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1542 {
1543         u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1544
1545         return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
1546 }
1547 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1548
1549 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1550 {
1551         kvm_x86_ops->write_tsc_offset(vcpu, offset);
1552         vcpu->arch.tsc_offset = offset;
1553 }
1554
1555 static inline bool kvm_check_tsc_unstable(void)
1556 {
1557 #ifdef CONFIG_X86_64
1558         /*
1559          * TSC is marked unstable when we're running on Hyper-V,
1560          * but the 'TSC page' clocksource is still good.
1561          */
1562         if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
1563                 return false;
1564 #endif
1565         return check_tsc_unstable();
1566 }
1567
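/*
 * Handle a guest or host write to the TSC.  Writes that land within about
 * one second of the value expected from the current generation (or a
 * host-initiated write of zero at vcpu creation) are treated as an attempt
 * to synchronize: with a stable host TSC the existing offset is reused,
 * otherwise the value is adjusted by the elapsed time.  Any other write
 * starts a new TSC generation, tracked in kvm->arch.cur_tsc_*.
 */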
1568 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1569 {
1570         struct kvm *kvm = vcpu->kvm;
1571         u64 offset, ns, elapsed;
1572         unsigned long flags;
1573         bool matched;
1574         bool already_matched;
1575         u64 data = msr->data;
1576         bool synchronizing = false;
1577
1578         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1579         offset = kvm_compute_tsc_offset(vcpu, data);
1580         ns = ktime_get_boot_ns();
1581         elapsed = ns - kvm->arch.last_tsc_nsec;
1582
1583         if (vcpu->arch.virtual_tsc_khz) {
1584                 if (data == 0 && msr->host_initiated) {
1585                         /*
1586                          * detection of vcpu initialization -- need to sync
1587                          * with other vCPUs. This particularly helps to keep
1588                          * kvm_clock stable after CPU hotplug
1589                          */
1590                         synchronizing = true;
1591                 } else {
1592                         u64 tsc_exp = kvm->arch.last_tsc_write +
1593                                                 nsec_to_cycles(vcpu, elapsed);
1594                         u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
1595                         /*
1596                          * Special case: TSC write with a small delta (1 second)
1597                          * of virtual cycle time against real time is
1598                          * interpreted as an attempt to synchronize the CPU.
1599                          */
1600                         synchronizing = data < tsc_exp + tsc_hz &&
1601                                         data + tsc_hz > tsc_exp;
1602                 }
1603         }
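        /*
         * A quick numeric sketch of the check above (illustrative only):
         * with virtual_tsc_khz = 1000000 (a 1 GHz guest TSC), tsc_hz is
         * 10^9 cycles.  If two seconds of wall clock time have elapsed
         * since the last write, tsc_exp = last_tsc_write + 2 * 10^9, and
         * any written value in the open interval
         *
         *	(tsc_exp - 10^9, tsc_exp + 10^9)
         *
         * i.e. within one second of virtual cycle time of the expected
         * value, is treated as a synchronization attempt rather than a
         * deliberate jump of the guest TSC.
         */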
1604
1605         /*
1606          * For a reliable TSC, we can match TSC offsets, and for an unstable
1607          * TSC, we add elapsed time in this computation.  We could let the
1608          * compensation code attempt to catch up if we fall behind, but
1609          * it's better to try to match offsets from the beginning.
1610          */
1611         if (synchronizing &&
1612             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1613                 if (!kvm_check_tsc_unstable()) {
1614                         offset = kvm->arch.cur_tsc_offset;
1615                         pr_debug("kvm: matched tsc offset for %llu\n", data);
1616                 } else {
1617                         u64 delta = nsec_to_cycles(vcpu, elapsed);
1618                         data += delta;
1619                         offset = kvm_compute_tsc_offset(vcpu, data);
1620                         pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1621                 }
1622                 matched = true;
1623                 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1624         } else {
1625                 /*
1626                  * We split periods of matched TSC writes into generations.
1627                  * For each generation, we track the original measured
1628                  * nanosecond time, offset, and write, so if TSCs are in
1629                  * sync, we can match exact offset, and if not, we can match
1630                  * exact software computation in compute_guest_tsc()
1631                  *
1632                  * These values are tracked in kvm->arch.cur_xxx variables.
1633                  */
1634                 kvm->arch.cur_tsc_generation++;
1635                 kvm->arch.cur_tsc_nsec = ns;
1636                 kvm->arch.cur_tsc_write = data;
1637                 kvm->arch.cur_tsc_offset = offset;
1638                 matched = false;
1639                 pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1640                          kvm->arch.cur_tsc_generation, data);
1641         }
1642
1643         /*
1644          * We also track the most recent recorded KHZ, write and time to
1645          * allow the matching interval to be extended at each write.
1646          */
1647         kvm->arch.last_tsc_nsec = ns;
1648         kvm->arch.last_tsc_write = data;
1649         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1650
1651         vcpu->arch.last_guest_tsc = data;
1652
1653         /* Keep track of which generation this VCPU has synchronized to */
1654         vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1655         vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1656         vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1657
1658         if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1659                 update_ia32_tsc_adjust_msr(vcpu, offset);
1660
1661         kvm_vcpu_write_tsc_offset(vcpu, offset);
1662         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1663
1664         spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1665         if (!matched) {
1666                 kvm->arch.nr_vcpus_matched_tsc = 0;
1667         } else if (!already_matched) {
1668                 kvm->arch.nr_vcpus_matched_tsc++;
1669         }
1670
1671         kvm_track_tsc_matching(vcpu);
1672         spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1673 }
1674
1675 EXPORT_SYMBOL_GPL(kvm_write_tsc);
1676
1677 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
1678                                            s64 adjustment)
1679 {
1680         kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
1681 }
1682
1683 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
1684 {
1685         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
1686                 WARN_ON(adjustment < 0);
1687         adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
1688         adjust_tsc_offset_guest(vcpu, adjustment);
1689 }
1690
1691 #ifdef CONFIG_X86_64
1692
1693 static u64 read_tsc(void)
1694 {
1695         u64 ret = (u64)rdtsc_ordered();
1696         u64 last = pvclock_gtod_data.clock.cycle_last;
1697
1698         if (likely(ret >= last))
1699                 return ret;
1700
1701         /*
1702          * GCC likes to generate cmov here, but this branch is extremely
1703          * predictable (it's just a function of time and the likely is
1704          * very likely) and there's a data dependence, so force GCC
1705          * to generate a branch instead.  I don't barrier() because
1706          * we don't actually need a barrier, and if this function
1707          * ever gets inlined it will generate worse code.
1708          */
1709         asm volatile ("");
1710         return last;
1711 }
1712
1713 static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
1714 {
1715         long v;
1716         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1717         u64 tsc_pg_val;
1718
1719         switch (gtod->clock.vclock_mode) {
1720         case VCLOCK_HVCLOCK:
1721                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
1722                                                   tsc_timestamp);
1723                 if (tsc_pg_val != U64_MAX) {
1724                         /* TSC page valid */
1725                         *mode = VCLOCK_HVCLOCK;
1726                         v = (tsc_pg_val - gtod->clock.cycle_last) &
1727                                 gtod->clock.mask;
1728                 } else {
1729                         /* TSC page invalid */
1730                         *mode = VCLOCK_NONE;
1731                 }
1732                 break;
1733         case VCLOCK_TSC:
1734                 *mode = VCLOCK_TSC;
1735                 *tsc_timestamp = read_tsc();
1736                 v = (*tsc_timestamp - gtod->clock.cycle_last) &
1737                         gtod->clock.mask;
1738                 break;
1739         default:
1740                 *mode = VCLOCK_NONE;
1741         }
1742
1743         if (*mode == VCLOCK_NONE)
1744                 *tsc_timestamp = v = 0;
1745
1746         return v * gtod->clock.mult;
1747 }
1748
1749 static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
1750 {
1751         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1752         unsigned long seq;
1753         int mode;
1754         u64 ns;
1755
1756         do {
1757                 seq = read_seqcount_begin(&gtod->seq);
1758                 ns = gtod->nsec_base;
1759                 ns += vgettsc(tsc_timestamp, &mode);
1760                 ns >>= gtod->clock.shift;
1761                 ns += gtod->boot_ns;
1762         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1763         *t = ns;
1764
1765         return mode;
1766 }
1767
1768 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
1769 {
1770         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1771         unsigned long seq;
1772         int mode;
1773         u64 ns;
1774
1775         do {
1776                 seq = read_seqcount_begin(&gtod->seq);
1777                 ts->tv_sec = gtod->wall_time_sec;
1778                 ns = gtod->nsec_base;
1779                 ns += vgettsc(tsc_timestamp, &mode);
1780                 ns >>= gtod->clock.shift;
1781         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1782
1783         ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
1784         ts->tv_nsec = ns;
1785
1786         return mode;
1787 }
1788
1789 /* returns true if host is using a TSC-based clocksource */
1790 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
1791 {
1792         /* checked again under seqlock below */
1793         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1794                 return false;
1795
1796         return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
1797                                                       tsc_timestamp));
1798 }
1799
1800 /* returns true if host is using a TSC-based clocksource */
1801 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
1802                                            u64 *tsc_timestamp)
1803 {
1804         /* checked again under seqlock below */
1805         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1806                 return false;
1807
1808         return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
1809 }
1810 #endif
1811
1812 /*
1813  *
1814  * Assuming a stable TSC across physical CPUs, and a stable TSC
1815  * across virtual CPUs, the following condition is possible.
1816  * Each numbered line represents an event visible to both
1817  * CPUs at the next numbered event.
1818  *
1819  * "timespecX" represents host monotonic time. "tscX" represents
1820  * RDTSC value.
1821  *
1822  *              VCPU0 on CPU0           |       VCPU1 on CPU1
1823  *
1824  * 1.  read timespec0,tsc0
1825  * 2.                                   | timespec1 = timespec0 + N
1826  *                                      | tsc1 = tsc0 + M
1827  * 3. transition to guest               | transition to guest
1828  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1829  * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
1830  *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1831  *
1832  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1833  *
1834  *      - ret0 < ret1
1835  *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1836  *              ...
1837  *      - 0 < N - M => M < N
1838  *
1839  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1840  * always the case (the difference between two distinct xtime instances
1841  * might be smaller than the difference between corresponding TSC reads,
1842  * when updating guest vcpus pvclock areas).
1843  *
1844  * To avoid that problem, do not allow visibility of distinct
1845  * system_timestamp/tsc_timestamp values simultaneously: use a master
1846  * copy of host monotonic time values. Update that master copy
1847  * in lockstep.
1848  *
1849  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1850  *
1851  */
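/*
 * A concrete (illustrative) instance of the problem described above,
 * using the same simplification as the derivation (treating the two
 * rdtsc reads as equal): take a 1 GHz TSC, N = 100ns and M = 150
 * cycles (= 150ns).  Then
 *
 *	ret0 = timespec0 + (rdtsc - tsc0)
 *	ret1 = timespec0 + 100ns + (rdtsc - tsc0 - 150ns)
 *	     = ret0 - 50ns
 *
 * so a guest reading kvmclock on VCPU1 right after VCPU0 would see
 * time go backwards by 50ns.  The master copy updated in lockstep
 * avoids exactly this.
 */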
1852
1853 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1854 {
1855 #ifdef CONFIG_X86_64
1856         struct kvm_arch *ka = &kvm->arch;
1857         int vclock_mode;
1858         bool host_tsc_clocksource, vcpus_matched;
1859
1860         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1861                         atomic_read(&kvm->online_vcpus));
1862
1863         /*
1864          * If the host uses TSC clock, then passthrough TSC as stable
1865          * to the guest.
1866          */
1867         host_tsc_clocksource = kvm_get_time_and_clockread(
1868                                         &ka->master_kernel_ns,
1869                                         &ka->master_cycle_now);
1870
1871         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1872                                 && !ka->backwards_tsc_observed
1873                                 && !ka->boot_vcpu_runs_old_kvmclock;
1874
1875         if (ka->use_master_clock)
1876                 atomic_set(&kvm_guest_has_master_clock, 1);
1877
1878         vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1879         trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1880                                         vcpus_matched);
1881 #endif
1882 }
1883
1884 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
1885 {
1886         kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
1887 }
1888
1889 static void kvm_gen_update_masterclock(struct kvm *kvm)
1890 {
1891 #ifdef CONFIG_X86_64
1892         int i;
1893         struct kvm_vcpu *vcpu;
1894         struct kvm_arch *ka = &kvm->arch;
1895
1896         spin_lock(&ka->pvclock_gtod_sync_lock);
1897         kvm_make_mclock_inprogress_request(kvm);
1898         /* no guest entries from this point */
1899         pvclock_update_vm_gtod_copy(kvm);
1900
1901         kvm_for_each_vcpu(i, vcpu, kvm)
1902                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1903
1904         /* guest entries allowed */
1905         kvm_for_each_vcpu(i, vcpu, kvm)
1906                 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
1907
1908         spin_unlock(&ka->pvclock_gtod_sync_lock);
1909 #endif
1910 }
1911
1912 u64 get_kvmclock_ns(struct kvm *kvm)
1913 {
1914         struct kvm_arch *ka = &kvm->arch;
1915         struct pvclock_vcpu_time_info hv_clock;
1916         u64 ret;
1917
1918         spin_lock(&ka->pvclock_gtod_sync_lock);
1919         if (!ka->use_master_clock) {
1920                 spin_unlock(&ka->pvclock_gtod_sync_lock);
1921                 return ktime_get_boot_ns() + ka->kvmclock_offset;
1922         }
1923
1924         hv_clock.tsc_timestamp = ka->master_cycle_now;
1925         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
1926         spin_unlock(&ka->pvclock_gtod_sync_lock);
1927
1928         /* both __this_cpu_read() and rdtsc() should be on the same cpu */
1929         get_cpu();
1930
1931         if (__this_cpu_read(cpu_tsc_khz)) {
1932                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
1933                                    &hv_clock.tsc_shift,
1934                                    &hv_clock.tsc_to_system_mul);
1935                 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
1936         } else
1937                 ret = ktime_get_boot_ns() + ka->kvmclock_offset;
1938
1939         put_cpu();
1940
1941         return ret;
1942 }
1943
1944 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1945 {
1946         struct kvm_vcpu_arch *vcpu = &v->arch;
1947         struct pvclock_vcpu_time_info guest_hv_clock;
1948
1949         if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1950                 &guest_hv_clock, sizeof(guest_hv_clock))))
1951                 return;
1952
1953         /* This VCPU is paused, but it's legal for a guest to read another
1954          * VCPU's kvmclock, so we really have to follow the specification where
1955          * it says that version is odd if data is being modified, and even after
1956          * it is consistent.
1957          *
1958          * Version field updates must be kept separate.  This is because
1959          * kvm_write_guest_cached might use a "rep movs" instruction, and
1960          * writes within a string instruction are weakly ordered.  So there
1961          * are three writes overall.
1962          *
1963          * As a small optimization, only write the version field in the first
1964          * and third write.  The vcpu->pv_time cache is still valid, because the
1965          * version field is the first in the struct.
1966          */
1967         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
1968
1969         if (guest_hv_clock.version & 1)
1970                 ++guest_hv_clock.version;  /* first time write, random junk */
1971
1972         vcpu->hv_clock.version = guest_hv_clock.version + 1;
1973         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1974                                 &vcpu->hv_clock,
1975                                 sizeof(vcpu->hv_clock.version));
1976
1977         smp_wmb();
1978
1979         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1980         vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
1981
1982         if (vcpu->pvclock_set_guest_stopped_request) {
1983                 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
1984                 vcpu->pvclock_set_guest_stopped_request = false;
1985         }
1986
1987         trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
1988
1989         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1990                                 &vcpu->hv_clock,
1991                                 sizeof(vcpu->hv_clock));
1992
1993         smp_wmb();
1994
1995         vcpu->hv_clock.version++;
1996         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1997                                 &vcpu->hv_clock,
1998                                 sizeof(vcpu->hv_clock.version));
1999 }
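/*
 * For reference, a guest-side reader has to honour the version protocol
 * that kvm_setup_pvclock_page() implements.  A minimal sketch (guest
 * context, not part of this file, names simplified):
 *
 *	do {
 *		version = READ_ONCE(hv_clock->version);
 *		smp_rmb();
 *		tsc_to_system_mul = hv_clock->tsc_to_system_mul;
 *		tsc_shift         = hv_clock->tsc_shift;
 *		system_time       = hv_clock->system_time;
 *		tsc_timestamp     = hv_clock->tsc_timestamp;
 *		smp_rmb();
 *	} while ((version & 1) || version != READ_ONCE(hv_clock->version));
 *
 * i.e. retry while the version is odd (update in progress) or changed
 * between the two reads, mirroring the three writes done above.
 */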
2000
2001 static int kvm_guest_time_update(struct kvm_vcpu *v)
2002 {
2003         unsigned long flags, tgt_tsc_khz;
2004         struct kvm_vcpu_arch *vcpu = &v->arch;
2005         struct kvm_arch *ka = &v->kvm->arch;
2006         s64 kernel_ns;
2007         u64 tsc_timestamp, host_tsc;
2008         u8 pvclock_flags;
2009         bool use_master_clock;
2010
2011         kernel_ns = 0;
2012         host_tsc = 0;
2013
2014         /*
2015          * If the host uses TSC clock, then passthrough TSC as stable
2016          * to the guest.
2017          */
2018         spin_lock(&ka->pvclock_gtod_sync_lock);
2019         use_master_clock = ka->use_master_clock;
2020         if (use_master_clock) {
2021                 host_tsc = ka->master_cycle_now;
2022                 kernel_ns = ka->master_kernel_ns;
2023         }
2024         spin_unlock(&ka->pvclock_gtod_sync_lock);
2025
2026         /* Keep irq disabled to prevent changes to the clock */
2027         local_irq_save(flags);
2028         tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2029         if (unlikely(tgt_tsc_khz == 0)) {
2030                 local_irq_restore(flags);
2031                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2032                 return 1;
2033         }
2034         if (!use_master_clock) {
2035                 host_tsc = rdtsc();
2036                 kernel_ns = ktime_get_boot_ns();
2037         }
2038
2039         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2040
2041         /*
2042          * We may have to catch up the TSC to match elapsed wall clock
2043          * time for two reasons, even if kvmclock is used.
2044          *   1) CPU could have been running below the maximum TSC rate
2045          *   2) Broken TSC compensation resets the base at each VCPU
2046          *      entry to avoid unknown leaps of TSC even when running
2047          *      again on the same CPU.  This may cause apparent elapsed
2048          *      time to disappear, and the guest to stand still or run
2049          *      very slowly.
2050          */
2051         if (vcpu->tsc_catchup) {
2052                 u64 tsc = compute_guest_tsc(v, kernel_ns);
2053                 if (tsc > tsc_timestamp) {
2054                         adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2055                         tsc_timestamp = tsc;
2056                 }
2057         }
2058
2059         local_irq_restore(flags);
2060
2061         /* With all the info we got, fill in the values */
2062
2063         if (kvm_has_tsc_control)
2064                 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2065
2066         if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2067                 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2068                                    &vcpu->hv_clock.tsc_shift,
2069                                    &vcpu->hv_clock.tsc_to_system_mul);
2070                 vcpu->hw_tsc_khz = tgt_tsc_khz;
2071         }
2072
2073         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2074         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2075         vcpu->last_guest_tsc = tsc_timestamp;
2076
2077         /* If the host uses TSC clocksource, then it is stable */
2078         pvclock_flags = 0;
2079         if (use_master_clock)
2080                 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2081
2082         vcpu->hv_clock.flags = pvclock_flags;
2083
2084         if (vcpu->pv_time_enabled)
2085                 kvm_setup_pvclock_page(v);
2086         if (v == kvm_get_vcpu(v->kvm, 0))
2087                 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2088         return 0;
2089 }
2090
2091 /*
2092  * kvmclock updates which are isolated to a given vcpu, such as
2093  * vcpu->cpu migration, should not allow system_timestamp from
2094  * the rest of the vcpus to remain static. Otherwise ntp frequency
2095  * correction applies to one vcpu's system_timestamp but not
2096  * the others.
2097  *
2098  * So in those cases, request a kvmclock update for all vcpus.
2099  * We need to rate-limit these requests though, as they can
2100  * considerably slow guests that have a large number of vcpus.
2101  * The time for a remote vcpu to update its kvmclock is bound
2102  * by the delay we use to rate-limit the updates.
2103  */
2104
2105 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2106
2107 static void kvmclock_update_fn(struct work_struct *work)
2108 {
2109         int i;
2110         struct delayed_work *dwork = to_delayed_work(work);
2111         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2112                                            kvmclock_update_work);
2113         struct kvm *kvm = container_of(ka, struct kvm, arch);
2114         struct kvm_vcpu *vcpu;
2115
2116         kvm_for_each_vcpu(i, vcpu, kvm) {
2117                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2118                 kvm_vcpu_kick(vcpu);
2119         }
2120 }
2121
2122 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2123 {
2124         struct kvm *kvm = v->kvm;
2125
2126         kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2127         schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2128                                         KVMCLOCK_UPDATE_DELAY);
2129 }
2130
2131 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2132
2133 static void kvmclock_sync_fn(struct work_struct *work)
2134 {
2135         struct delayed_work *dwork = to_delayed_work(work);
2136         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2137                                            kvmclock_sync_work);
2138         struct kvm *kvm = container_of(ka, struct kvm, arch);
2139
2140         if (!kvmclock_periodic_sync)
2141                 return;
2142
2143         schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2144         schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2145                                         KVMCLOCK_SYNC_PERIOD);
2146 }
2147
2148 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2149 {
2150         u64 mcg_cap = vcpu->arch.mcg_cap;
2151         unsigned bank_num = mcg_cap & 0xff;
2152         u32 msr = msr_info->index;
2153         u64 data = msr_info->data;
2154
2155         switch (msr) {
2156         case MSR_IA32_MCG_STATUS:
2157                 vcpu->arch.mcg_status = data;
2158                 break;
2159         case MSR_IA32_MCG_CTL:
2160                 if (!(mcg_cap & MCG_CTL_P))
2161                         return 1;
2162                 if (data != 0 && data != ~(u64)0)
2163                         return -1;
2164                 vcpu->arch.mcg_ctl = data;
2165                 break;
2166         default:
2167                 if (msr >= MSR_IA32_MC0_CTL &&
2168                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2169                         u32 offset = msr - MSR_IA32_MC0_CTL;
2170                         /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2171                          * Some Linux kernels though clear bit 10 in bank 4 to
2172                          * work around a BIOS/GART TBL issue on AMD K8s; ignore
2173                          * this to avoid an uncaught #GP in the guest.
2174                          */
2175                         if ((offset & 0x3) == 0 &&
2176                             data != 0 && (data | (1 << 10)) != ~(u64)0)
2177                                 return -1;
2178                         if (!msr_info->host_initiated &&
2179                                 (offset & 0x3) == 1 && data != 0)
2180                                 return -1;
2181                         vcpu->arch.mce_banks[offset] = data;
2182                         break;
2183                 }
2184                 return 1;
2185         }
2186         return 0;
2187 }
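/*
 * The IA32_MCi_CTL check above accepts, for example (illustrative only):
 *
 *	data == 0x0000000000000000	all error reporting disabled
 *	data == 0xffffffffffffffff	all error reporting enabled
 *	data == 0xfffffffffffffbff	all 1s except bit 10, as written by
 *					Linux guests applying the K8
 *					BIOS/GART erratum workaround
 *
 * while any other pattern makes set_msr_mce() return -1 and the write
 * is refused.
 */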
2188
2189 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2190 {
2191         struct kvm *kvm = vcpu->kvm;
2192         int lm = is_long_mode(vcpu);
2193         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2194                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2195         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2196                 : kvm->arch.xen_hvm_config.blob_size_32;
2197         u32 page_num = data & ~PAGE_MASK;
2198         u64 page_addr = data & PAGE_MASK;
2199         u8 *page;
2200         int r;
2201
2202         r = -E2BIG;
2203         if (page_num >= blob_size)
2204                 goto out;
2205         r = -ENOMEM;
2206         page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2207         if (IS_ERR(page)) {
2208                 r = PTR_ERR(page);
2209                 goto out;
2210         }
2211         if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2212                 goto out_free;
2213         r = 0;
2214 out_free:
2215         kfree(page);
2216 out:
2217         return r;
2218 }
2219
2220 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2221 {
2222         gpa_t gpa = data & ~0x3f;
2223
2224         /* Bits 3:5 are reserved, should be zero */
2225         if (data & 0x38)
2226                 return 1;
2227
2228         vcpu->arch.apf.msr_val = data;
2229
2230         if (!(data & KVM_ASYNC_PF_ENABLED)) {
2231                 kvm_clear_async_pf_completion_queue(vcpu);
2232                 kvm_async_pf_hash_reset(vcpu);
2233                 return 0;
2234         }
2235
2236         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2237                                         sizeof(u32)))
2238                 return 1;
2239
2240         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2241         vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2242         kvm_async_pf_wakeup_all(vcpu);
2243         return 0;
2244 }
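/*
 * Layout of the MSR value handled above (summarized for illustration,
 * matching the masks used in kvm_pv_enable_async_pf()):
 *
 *	bit  0		KVM_ASYNC_PF_ENABLED
 *	bit  1		KVM_ASYNC_PF_SEND_ALWAYS (else user mode only)
 *	bit  2		KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT
 *	bits 3:5	reserved, must be zero (the data & 0x38 check)
 *	bits 6:63	64-byte-aligned guest physical address of the
 *			4-byte async PF word (gpa = data & ~0x3f)
 */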
2245
2246 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2247 {
2248         vcpu->arch.pv_time_enabled = false;
2249 }
2250
2251 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
2252 {
2253         ++vcpu->stat.tlb_flush;
2254         kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
2255 }
2256
2257 static void record_steal_time(struct kvm_vcpu *vcpu)
2258 {
2259         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2260                 return;
2261
2262         if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2263                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2264                 return;
2265
2266         /*
2267          * Doing a TLB flush here, on the guest's behalf, can avoid
2268          * expensive IPIs.
2269          */
2270         if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
2271                 kvm_vcpu_flush_tlb(vcpu, false);
2272
2273         if (vcpu->arch.st.steal.version & 1)
2274                 vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
2275
2276         vcpu->arch.st.steal.version += 1;
2277
2278         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2279                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2280
2281         smp_wmb();
2282
2283         vcpu->arch.st.steal.steal += current->sched_info.run_delay -
2284                 vcpu->arch.st.last_steal;
2285         vcpu->arch.st.last_steal = current->sched_info.run_delay;
2286
2287         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2288                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2289
2290         smp_wmb();
2291
2292         vcpu->arch.st.steal.version += 1;
2293
2294         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2295                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2296 }
2297
2298 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2299 {
2300         bool pr = false;
2301         u32 msr = msr_info->index;
2302         u64 data = msr_info->data;
2303
2304         switch (msr) {
2305         case MSR_AMD64_NB_CFG:
2306         case MSR_IA32_UCODE_WRITE:
2307         case MSR_VM_HSAVE_PA:
2308         case MSR_AMD64_PATCH_LOADER:
2309         case MSR_AMD64_BU_CFG2:
2310         case MSR_AMD64_DC_CFG:
2311                 break;
2312
2313         case MSR_IA32_UCODE_REV:
2314                 if (msr_info->host_initiated)
2315                         vcpu->arch.microcode_version = data;
2316                 break;
2317         case MSR_EFER:
2318                 return set_efer(vcpu, data);
2319         case MSR_K7_HWCR:
2320                 data &= ~(u64)0x40;     /* ignore flush filter disable */
2321                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
2322                 data &= ~(u64)0x8;      /* ignore TLB cache disable */
2323                 data &= ~(u64)0x40000;  /* ignore Mc status write enable */
2324                 if (data != 0) {
2325                         vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2326                                     data);
2327                         return 1;
2328                 }
2329                 break;
2330         case MSR_FAM10H_MMIO_CONF_BASE:
2331                 if (data != 0) {
2332                         vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2333                                     "0x%llx\n", data);
2334                         return 1;
2335                 }
2336                 break;
2337         case MSR_IA32_DEBUGCTLMSR:
2338                 if (!data) {
2339                         /* We support the non-activated case already */
2340                         break;
2341                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2342                         /* Values other than LBR and BTF are vendor-specific,
2343                          * thus reserved and should throw a #GP */
2344                         return 1;
2345                 }
2346                 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2347                             __func__, data);
2348                 break;
2349         case 0x200 ... 0x2ff:
2350                 return kvm_mtrr_set_msr(vcpu, msr, data);
2351         case MSR_IA32_APICBASE:
2352                 return kvm_set_apic_base(vcpu, msr_info);
2353         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2354                 return kvm_x2apic_msr_write(vcpu, msr, data);
2355         case MSR_IA32_TSCDEADLINE:
2356                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2357                 break;
2358         case MSR_IA32_TSC_ADJUST:
2359                 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
2360                         if (!msr_info->host_initiated) {
2361                                 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2362                                 adjust_tsc_offset_guest(vcpu, adj);
2363                         }
2364                         vcpu->arch.ia32_tsc_adjust_msr = data;
2365                 }
2366                 break;
2367         case MSR_IA32_MISC_ENABLE:
2368                 vcpu->arch.ia32_misc_enable_msr = data;
2369                 break;
2370         case MSR_IA32_SMBASE:
2371                 if (!msr_info->host_initiated)
2372                         return 1;
2373                 vcpu->arch.smbase = data;
2374                 break;
2375         case MSR_IA32_TSC:
2376                 kvm_write_tsc(vcpu, msr_info);
2377                 break;
2378         case MSR_SMI_COUNT:
2379                 if (!msr_info->host_initiated)
2380                         return 1;
2381                 vcpu->arch.smi_count = data;
2382                 break;
2383         case MSR_KVM_WALL_CLOCK_NEW:
2384         case MSR_KVM_WALL_CLOCK:
2385                 vcpu->kvm->arch.wall_clock = data;
2386                 kvm_write_wall_clock(vcpu->kvm, data);
2387                 break;
2388         case MSR_KVM_SYSTEM_TIME_NEW:
2389         case MSR_KVM_SYSTEM_TIME: {
2390                 struct kvm_arch *ka = &vcpu->kvm->arch;
2391
2392                 kvmclock_reset(vcpu);
2393
2394                 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2395                         bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2396
2397                         if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2398                                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2399
2400                         ka->boot_vcpu_runs_old_kvmclock = tmp;
2401                 }
2402
2403                 vcpu->arch.time = data;
2404                 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2405
2406                 /* check whether the enable bit is set... */
2407                 if (!(data & 1))
2408                         break;
2409
2410                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2411                      &vcpu->arch.pv_time, data & ~1ULL,
2412                      sizeof(struct pvclock_vcpu_time_info)))
2413                         vcpu->arch.pv_time_enabled = false;
2414                 else
2415                         vcpu->arch.pv_time_enabled = true;
2416
2417                 break;
2418         }
2419         case MSR_KVM_ASYNC_PF_EN:
2420                 if (kvm_pv_enable_async_pf(vcpu, data))
2421                         return 1;
2422                 break;
2423         case MSR_KVM_STEAL_TIME:
2424
2425                 if (unlikely(!sched_info_on()))
2426                         return 1;
2427
2428                 if (data & KVM_STEAL_RESERVED_MASK)
2429                         return 1;
2430
2431                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2432                                                 data & KVM_STEAL_VALID_BITS,
2433                                                 sizeof(struct kvm_steal_time)))
2434                         return 1;
2435
2436                 vcpu->arch.st.msr_val = data;
2437
2438                 if (!(data & KVM_MSR_ENABLED))
2439                         break;
2440
2441                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2442
2443                 break;
2444         case MSR_KVM_PV_EOI_EN:
2445                 if (kvm_lapic_enable_pv_eoi(vcpu, data))
2446                         return 1;
2447                 break;
2448
2449         case MSR_IA32_MCG_CTL:
2450         case MSR_IA32_MCG_STATUS:
2451         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2452                 return set_msr_mce(vcpu, msr_info);
2453
2454         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2455         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2456                 pr = true; /* fall through */
2457         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2458         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2459                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2460                         return kvm_pmu_set_msr(vcpu, msr_info);
2461
2462                 if (pr || data != 0)
2463                         vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2464                                     "0x%x data 0x%llx\n", msr, data);
2465                 break;
2466         case MSR_K7_CLK_CTL:
2467                 /*
2468                  * Ignore all writes to this no longer documented MSR.
2469                  * Writes are only relevant for old K7 processors,
2470                  * all pre-dating SVM, for which the write is a
2471                  * recommended workaround from AMD. Since the affected
2472                  * processor models can be specified on the command line,
2473                  * the workaround write has to be ignored here.
2474                  */
2475                 break;
2476         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2477         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2478         case HV_X64_MSR_CRASH_CTL:
2479         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2480         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2481         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2482         case HV_X64_MSR_TSC_EMULATION_STATUS:
2483                 return kvm_hv_set_msr_common(vcpu, msr, data,
2484                                              msr_info->host_initiated);
2485         case MSR_IA32_BBL_CR_CTL3:
2486                 /* Drop writes to this legacy MSR -- see rdmsr
2487                  * counterpart for further detail.
2488                  */
2489                 if (report_ignored_msrs)
2490                         vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
2491                                 msr, data);
2492                 break;
2493         case MSR_AMD64_OSVW_ID_LENGTH:
2494                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2495                         return 1;
2496                 vcpu->arch.osvw.length = data;
2497                 break;
2498         case MSR_AMD64_OSVW_STATUS:
2499                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2500                         return 1;
2501                 vcpu->arch.osvw.status = data;
2502                 break;
2503         case MSR_PLATFORM_INFO:
2504                 if (!msr_info->host_initiated ||
2505                     data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
2506                     (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
2507                      cpuid_fault_enabled(vcpu)))
2508                         return 1;
2509                 vcpu->arch.msr_platform_info = data;
2510                 break;
2511         case MSR_MISC_FEATURES_ENABLES:
2512                 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
2513                     (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
2514                      !supports_cpuid_fault(vcpu)))
2515                         return 1;
2516                 vcpu->arch.msr_misc_features_enables = data;
2517                 break;
2518         default:
2519                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2520                         return xen_hvm_config(vcpu, data);
2521                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2522                         return kvm_pmu_set_msr(vcpu, msr_info);
2523                 if (!ignore_msrs) {
2524                         vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2525                                     msr, data);
2526                         return 1;
2527                 } else {
2528                         if (report_ignored_msrs)
2529                                 vcpu_unimpl(vcpu,
2530                                         "ignored wrmsr: 0x%x data 0x%llx\n",
2531                                         msr, data);
2532                         break;
2533                 }
2534         }
2535         return 0;
2536 }
2537 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2538
2539
2540 /*
2541  * Reads the MSR specified by msr->index into msr->data.
2542  * Returns 0 on success, non-0 otherwise.
2543  * Assumes vcpu_load() was already called.
2544  */
2545 int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2546 {
2547         return kvm_x86_ops->get_msr(vcpu, msr);
2548 }
2549 EXPORT_SYMBOL_GPL(kvm_get_msr);
2550
2551 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2552 {
2553         u64 data;
2554         u64 mcg_cap = vcpu->arch.mcg_cap;
2555         unsigned bank_num = mcg_cap & 0xff;
2556
2557         switch (msr) {
2558         case MSR_IA32_P5_MC_ADDR:
2559         case MSR_IA32_P5_MC_TYPE:
2560                 data = 0;
2561                 break;
2562         case MSR_IA32_MCG_CAP:
2563                 data = vcpu->arch.mcg_cap;
2564                 break;
2565         case MSR_IA32_MCG_CTL:
2566                 if (!(mcg_cap & MCG_CTL_P))
2567                         return 1;
2568                 data = vcpu->arch.mcg_ctl;
2569                 break;
2570         case MSR_IA32_MCG_STATUS:
2571                 data = vcpu->arch.mcg_status;
2572                 break;
2573         default:
2574                 if (msr >= MSR_IA32_MC0_CTL &&
2575                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2576                         u32 offset = msr - MSR_IA32_MC0_CTL;
2577                         data = vcpu->arch.mce_banks[offset];
2578                         break;
2579                 }
2580                 return 1;
2581         }
2582         *pdata = data;
2583         return 0;
2584 }
2585
2586 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2587 {
2588         switch (msr_info->index) {
2589         case MSR_IA32_PLATFORM_ID:
2590         case MSR_IA32_EBL_CR_POWERON:
2591         case MSR_IA32_DEBUGCTLMSR:
2592         case MSR_IA32_LASTBRANCHFROMIP:
2593         case MSR_IA32_LASTBRANCHTOIP:
2594         case MSR_IA32_LASTINTFROMIP:
2595         case MSR_IA32_LASTINTTOIP:
2596         case MSR_K8_SYSCFG:
2597         case MSR_K8_TSEG_ADDR:
2598         case MSR_K8_TSEG_MASK:
2599         case MSR_K7_HWCR:
2600         case MSR_VM_HSAVE_PA:
2601         case MSR_K8_INT_PENDING_MSG:
2602         case MSR_AMD64_NB_CFG:
2603         case MSR_FAM10H_MMIO_CONF_BASE:
2604         case MSR_AMD64_BU_CFG2:
2605         case MSR_IA32_PERF_CTL:
2606         case MSR_AMD64_DC_CFG:
2607                 msr_info->data = 0;
2608                 break;
2609         case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
2610         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2611         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2612         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2613         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2614                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2615                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2616                 msr_info->data = 0;
2617                 break;
2618         case MSR_IA32_UCODE_REV:
2619                 msr_info->data = vcpu->arch.microcode_version;
2620                 break;
2621         case MSR_IA32_TSC:
2622                 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
2623                 break;
2624         case MSR_MTRRcap:
2625         case 0x200 ... 0x2ff:
2626                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
2627         case 0xcd: /* fsb frequency */
2628                 msr_info->data = 3;
2629                 break;
2630                 /*
2631                  * MSR_EBC_FREQUENCY_ID
2632                  * Conservative value valid for even the basic CPU models.
2633                  * Models 0 and 1: 000 in bits 23:21 indicating a bus speed of
2634                  * 100MHz, model 2: 000 in bits 18:16 indicating 100MHz,
2635                  * and 266MHz for models 3 and 4. Set the Core Clock
2636                  * Frequency to System Bus Frequency Ratio to 1 (bits
2637                  * 31:24) even though it is only valid for CPU
2638                  * models > 2; otherwise guests may end up dividing or
2639                  * multiplying by zero.
2640                  */
2641         case MSR_EBC_FREQUENCY_ID:
2642                 msr_info->data = 1 << 24;
2643                 break;
2644         case MSR_IA32_APICBASE:
2645                 msr_info->data = kvm_get_apic_base(vcpu);
2646                 break;
2647         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2648                 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2649                 break;
2650         case MSR_IA32_TSCDEADLINE:
2651                 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
2652                 break;
2653         case MSR_IA32_TSC_ADJUST:
2654                 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2655                 break;
2656         case MSR_IA32_MISC_ENABLE:
2657                 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
2658                 break;
2659         case MSR_IA32_SMBASE:
2660                 if (!msr_info->host_initiated)
2661                         return 1;
2662                 msr_info->data = vcpu->arch.smbase;
2663                 break;
2664         case MSR_SMI_COUNT:
2665                 msr_info->data = vcpu->arch.smi_count;
2666                 break;
2667         case MSR_IA32_PERF_STATUS:
2668                 /* TSC increment by tick */
2669                 msr_info->data = 1000ULL;
2670                 /* CPU multiplier */
2671                 msr_info->data |= (((uint64_t)4ULL) << 40);
2672                 break;
2673         case MSR_EFER:
2674                 msr_info->data = vcpu->arch.efer;
2675                 break;
2676         case MSR_KVM_WALL_CLOCK:
2677         case MSR_KVM_WALL_CLOCK_NEW:
2678                 msr_info->data = vcpu->kvm->arch.wall_clock;
2679                 break;
2680         case MSR_KVM_SYSTEM_TIME:
2681         case MSR_KVM_SYSTEM_TIME_NEW:
2682                 msr_info->data = vcpu->arch.time;
2683                 break;
2684         case MSR_KVM_ASYNC_PF_EN:
2685                 msr_info->data = vcpu->arch.apf.msr_val;
2686                 break;
2687         case MSR_KVM_STEAL_TIME:
2688                 msr_info->data = vcpu->arch.st.msr_val;
2689                 break;
2690         case MSR_KVM_PV_EOI_EN:
2691                 msr_info->data = vcpu->arch.pv_eoi.msr_val;
2692                 break;
2693         case MSR_IA32_P5_MC_ADDR:
2694         case MSR_IA32_P5_MC_TYPE:
2695         case MSR_IA32_MCG_CAP:
2696         case MSR_IA32_MCG_CTL:
2697         case MSR_IA32_MCG_STATUS:
2698         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2699                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
2700         case MSR_K7_CLK_CTL:
2701                 /*
2702                  * Provide the expected ramp-up count for K7. All other
2703                  * fields are set to zero, indicating minimum divisors for
2704                  * every field.
2705                  *
2706                  * This prevents guest kernels on AMD host with CPU
2707                  * type 6, model 8 and higher from exploding due to
2708                  * the rdmsr failing.
2709                  */
2710                 msr_info->data = 0x20000000;
2711                 break;
2712         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2713         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2714         case HV_X64_MSR_CRASH_CTL:
2715         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2716         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2717         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2718         case HV_X64_MSR_TSC_EMULATION_STATUS:
2719                 return kvm_hv_get_msr_common(vcpu,
2720                                              msr_info->index, &msr_info->data);
2721                 break;
2722         case MSR_IA32_BBL_CR_CTL3:
2723                 /* This legacy MSR exists but isn't fully documented in current
2724                  * silicon.  It is however accessed by winxp in very narrow
2725                  * scenarios where it sets bit #19, itself documented as
2726                  * a "reserved" bit.  As a best effort, return coherent
2727                  * read data here in case the rest of the register is
2728                  * interpreted by the guest:
2729                  *
2730                  * L2 cache control register 3: 64GB range, 256KB size,
2731                  * enabled, latency 0x1, configured
2732                  */
2733                 msr_info->data = 0xbe702111;
2734                 break;
2735         case MSR_AMD64_OSVW_ID_LENGTH:
2736                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2737                         return 1;
2738                 msr_info->data = vcpu->arch.osvw.length;
2739                 break;
2740         case MSR_AMD64_OSVW_STATUS:
2741                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2742                         return 1;
2743                 msr_info->data = vcpu->arch.osvw.status;
2744                 break;
2745         case MSR_PLATFORM_INFO:
2746                 msr_info->data = vcpu->arch.msr_platform_info;
2747                 break;
2748         case MSR_MISC_FEATURES_ENABLES:
2749                 msr_info->data = vcpu->arch.msr_misc_features_enables;
2750                 break;
2751         default:
2752                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2753                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2754                 if (!ignore_msrs) {
2755                         vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2756                                                msr_info->index);
2757                         return 1;
2758                 } else {
2759                         if (report_ignored_msrs)
2760                                 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2761                                         msr_info->index);
2762                         msr_info->data = 0;
2763                 }
2764                 break;
2765         }
2766         return 0;
2767 }
2768 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2769
2770 /*
2771  * Read or write a bunch of msrs. All parameters are kernel addresses.
2772  *
2773  * @return number of msrs processed successfully.
2774  */
2775 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2776                     struct kvm_msr_entry *entries,
2777                     int (*do_msr)(struct kvm_vcpu *vcpu,
2778                                   unsigned index, u64 *data))
2779 {
2780         int i;
2781
2782         for (i = 0; i < msrs->nmsrs; ++i)
2783                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2784                         break;
2785
2786         return i;
2787 }
2788
2789 /*
2790  * Read or write a bunch of msrs. Parameters are user addresses.
2791  *
2792  * @return number of msrs processed successfully.
2793  */
2794 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2795                   int (*do_msr)(struct kvm_vcpu *vcpu,
2796                                 unsigned index, u64 *data),
2797                   int writeback)
2798 {
2799         struct kvm_msrs msrs;
2800         struct kvm_msr_entry *entries;
2801         int r, n;
2802         unsigned size;
2803
2804         r = -EFAULT;
2805         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2806                 goto out;
2807
2808         r = -E2BIG;
2809         if (msrs.nmsrs >= MAX_IO_MSRS)
2810                 goto out;
2811
2812         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2813         entries = memdup_user(user_msrs->entries, size);
2814         if (IS_ERR(entries)) {
2815                 r = PTR_ERR(entries);
2816                 goto out;
2817         }
2818
2819         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2820         if (r < 0)
2821                 goto out_free;
2822
2823         r = -EFAULT;
2824         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2825                 goto out_free;
2826
2827         r = n;
2828
2829 out_free:
2830         kfree(entries);
2831 out:
2832         return r;
2833 }
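/*
 * From userspace, the helpers above back the KVM_GET_MSRS/KVM_SET_MSRS
 * vcpu ioctls.  A rough sketch of a caller (userspace, illustrative
 * only; error handling omitted):
 *
 *	struct {
 *		struct kvm_msrs header;
 *		struct kvm_msr_entry entries[1];
 *	} msrs = {
 *		.header.nmsrs = 1,
 *		.entries[0].index = MSR_IA32_TSC,
 *	};
 *
 *	int n = ioctl(vcpu_fd, KVM_GET_MSRS, &msrs);
 *	// n is the number of entries processed; on success
 *	// msrs.entries[0].data holds the guest TSC value.
 */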
2834
2835 static inline bool kvm_can_mwait_in_guest(void)
2836 {
2837         return boot_cpu_has(X86_FEATURE_MWAIT) &&
2838                 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
2839                 boot_cpu_has(X86_FEATURE_ARAT);
2840 }
2841
2842 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2843 {
2844         int r = 0;
2845
2846         switch (ext) {
2847         case KVM_CAP_IRQCHIP:
2848         case KVM_CAP_HLT:
2849         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2850         case KVM_CAP_SET_TSS_ADDR:
2851         case KVM_CAP_EXT_CPUID:
2852         case KVM_CAP_EXT_EMUL_CPUID:
2853         case KVM_CAP_CLOCKSOURCE:
2854         case KVM_CAP_PIT:
2855         case KVM_CAP_NOP_IO_DELAY:
2856         case KVM_CAP_MP_STATE:
2857         case KVM_CAP_SYNC_MMU:
2858         case KVM_CAP_USER_NMI:
2859         case KVM_CAP_REINJECT_CONTROL:
2860         case KVM_CAP_IRQ_INJECT_STATUS:
2861         case KVM_CAP_IOEVENTFD:
2862         case KVM_CAP_IOEVENTFD_NO_LENGTH:
2863         case KVM_CAP_PIT2:
2864         case KVM_CAP_PIT_STATE2:
2865         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2866         case KVM_CAP_XEN_HVM:
2867         case KVM_CAP_VCPU_EVENTS:
2868         case KVM_CAP_HYPERV:
2869         case KVM_CAP_HYPERV_VAPIC:
2870         case KVM_CAP_HYPERV_SPIN:
2871         case KVM_CAP_HYPERV_SYNIC:
2872         case KVM_CAP_HYPERV_SYNIC2:
2873         case KVM_CAP_HYPERV_VP_INDEX:
2874         case KVM_CAP_HYPERV_EVENTFD:
2875         case KVM_CAP_HYPERV_TLBFLUSH:
2876         case KVM_CAP_PCI_SEGMENT:
2877         case KVM_CAP_DEBUGREGS:
2878         case KVM_CAP_X86_ROBUST_SINGLESTEP:
2879         case KVM_CAP_XSAVE:
2880         case KVM_CAP_ASYNC_PF:
2881         case KVM_CAP_GET_TSC_KHZ:
2882         case KVM_CAP_KVMCLOCK_CTRL:
2883         case KVM_CAP_READONLY_MEM:
2884         case KVM_CAP_HYPERV_TIME:
2885         case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2886         case KVM_CAP_TSC_DEADLINE_TIMER:
2887         case KVM_CAP_ENABLE_CAP_VM:
2888         case KVM_CAP_DISABLE_QUIRKS:
2889         case KVM_CAP_SET_BOOT_CPU_ID:
2890         case KVM_CAP_SPLIT_IRQCHIP:
2891         case KVM_CAP_IMMEDIATE_EXIT:
2892         case KVM_CAP_GET_MSR_FEATURES:
2893                 r = 1;
2894                 break;
2895         case KVM_CAP_SYNC_REGS:
2896                 r = KVM_SYNC_X86_VALID_FIELDS;
2897                 break;
2898         case KVM_CAP_ADJUST_CLOCK:
2899                 r = KVM_CLOCK_TSC_STABLE;
2900                 break;
2901         case KVM_CAP_X86_DISABLE_EXITS:
2902                 r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
2903                 if (kvm_can_mwait_in_guest())
2904                         r |= KVM_X86_DISABLE_EXITS_MWAIT;
2905                 break;
2906         case KVM_CAP_X86_SMM:
2907                 /* SMBASE is usually relocated above 1M on modern chipsets,
2908                  * and SMM handlers might indeed rely on 4G segment limits,
2909                  * so do not report SMM to be available if real mode is
2910                  * emulated via vm86 mode.  Still, do not go to great lengths
2911                  * to avoid userspace's usage of the feature, because it is a
2912                  * fringe case that is not enabled except via specific settings
2913                  * of the module parameters.
2914                  */
2915                 r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
2916                 break;
2917         case KVM_CAP_VAPIC:
2918                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2919                 break;
2920         case KVM_CAP_NR_VCPUS:
2921                 r = KVM_SOFT_MAX_VCPUS;
2922                 break;
2923         case KVM_CAP_MAX_VCPUS:
2924                 r = KVM_MAX_VCPUS;
2925                 break;
2926         case KVM_CAP_NR_MEMSLOTS:
2927                 r = KVM_USER_MEM_SLOTS;
2928                 break;
2929         case KVM_CAP_PV_MMU:    /* obsolete */
2930                 r = 0;
2931                 break;
2932         case KVM_CAP_MCE:
2933                 r = KVM_MAX_MCE_BANKS;
2934                 break;
2935         case KVM_CAP_XCRS:
2936                 r = boot_cpu_has(X86_FEATURE_XSAVE);
2937                 break;
2938         case KVM_CAP_TSC_CONTROL:
2939                 r = kvm_has_tsc_control;
2940                 break;
2941         case KVM_CAP_X2APIC_API:
2942                 r = KVM_X2APIC_API_VALID_FLAGS;
2943                 break;
2944         default:
2945                 break;
2946         }
2947         return r;
2948
2949 }
2950
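/*
 * Device-level ioctls, issued on /dev/kvm rather than on a VM or vCPU
 * file descriptor.  These only report host-wide information: the lists
 * of saved and emulated MSRs, the supported/emulated CPUID leaves, the
 * supported MCE capabilities, and the values of MSR-based features.
 */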
2951 long kvm_arch_dev_ioctl(struct file *filp,
2952                         unsigned int ioctl, unsigned long arg)
2953 {
2954         void __user *argp = (void __user *)arg;
2955         long r;
2956
2957         switch (ioctl) {
2958         case KVM_GET_MSR_INDEX_LIST: {
2959                 struct kvm_msr_list __user *user_msr_list = argp;
2960                 struct kvm_msr_list msr_list;
2961                 unsigned int n;
2962
2963                 r = -EFAULT;
2964                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
2965                         goto out;
2966                 n = msr_list.nmsrs;
2967                 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
2968                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
2969                         goto out;
2970                 r = -E2BIG;
2971                 if (n < msr_list.nmsrs)
2972                         goto out;
2973                 r = -EFAULT;
2974                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2975                                  num_msrs_to_save * sizeof(u32)))
2976                         goto out;
2977                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2978                                  &emulated_msrs,
2979                                  num_emulated_msrs * sizeof(u32)))
2980                         goto out;
2981                 r = 0;
2982                 break;
2983         }
2984         case KVM_GET_SUPPORTED_CPUID:
2985         case KVM_GET_EMULATED_CPUID: {
2986                 struct kvm_cpuid2 __user *cpuid_arg = argp;
2987                 struct kvm_cpuid2 cpuid;
2988
2989                 r = -EFAULT;
2990                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
2991                         goto out;
2992
2993                 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
2994                                             ioctl);
2995                 if (r)
2996                         goto out;
2997
2998                 r = -EFAULT;
2999                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
3000                         goto out;
3001                 r = 0;
3002                 break;
3003         }
3004         case KVM_X86_GET_MCE_CAP_SUPPORTED:
3005                 r = -EFAULT;
3006                 if (copy_to_user(argp, &kvm_mce_cap_supported,
3007                                  sizeof(kvm_mce_cap_supported)))
3008                         goto out;
3009                 r = 0;
3010                 break;
3011         case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3012                 struct kvm_msr_list __user *user_msr_list = argp;
3013                 struct kvm_msr_list msr_list;
3014                 unsigned int n;
3015
3016                 r = -EFAULT;
3017                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3018                         goto out;
3019                 n = msr_list.nmsrs;
3020                 msr_list.nmsrs = num_msr_based_features;
3021                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3022                         goto out;
3023                 r = -E2BIG;
3024                 if (n < msr_list.nmsrs)
3025                         goto out;
3026                 r = -EFAULT;
3027                 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3028                                  num_msr_based_features * sizeof(u32)))
3029                         goto out;
3030                 r = 0;
3031                 break;
3032         }
3033         case KVM_GET_MSRS:
3034                 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3035                 break;
3037         default:
3038                 r = -EINVAL;
3039         }
3040 out:
3041         return r;
3042 }
3043
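/*
 * WBINVD executed by the guest only needs special handling when the VM
 * has non-coherent DMA (typically an assigned device whose DMA is not
 * snooped); in that case the cache flush has to reach the physical CPUs
 * the vCPU has recently run on.
 */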
3044 static void wbinvd_ipi(void *garbage)
3045 {
3046         wbinvd();
3047 }
3048
3049 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3050 {
3051         return kvm_arch_has_noncoherent_dma(vcpu->kvm);
3052 }
3053
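/*
 * Called when a vCPU is scheduled in on a (possibly different) physical
 * CPU: handle WBINVD tracking for guests with non-coherent DMA, let the
 * vendor module load its state, apply any TSC adjustment detected across
 * host suspend, and compensate for unstable or backwards TSCs when the
 * vCPU migrates between physical CPUs.
 */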
3054 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3055 {
3056         /* Address the case where WBINVD may be executed by the guest */
3057         if (need_emulate_wbinvd(vcpu)) {
3058                 if (kvm_x86_ops->has_wbinvd_exit())
3059                         cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3060                 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3061                         smp_call_function_single(vcpu->cpu,
3062                                         wbinvd_ipi, NULL, 1);
3063         }
3064
3065         kvm_x86_ops->vcpu_load(vcpu, cpu);
3066
3067         /* Apply any externally detected TSC adjustments (due to suspend) */
3068         if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3069                 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3070                 vcpu->arch.tsc_offset_adjustment = 0;
3071                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3072         }
3073
3074         if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
3075                 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
3076                                 rdtsc() - vcpu->arch.last_host_tsc;
3077                 if (tsc_delta < 0)
3078                         mark_tsc_unstable("KVM discovered backwards TSC");
3079
3080                 if (kvm_check_tsc_unstable()) {
3081                         u64 offset = kvm_compute_tsc_offset(vcpu,
3082                                                 vcpu->arch.last_guest_tsc);
3083                         kvm_vcpu_write_tsc_offset(vcpu, offset);
3084                         vcpu->arch.tsc_catchup = 1;
3085                 }
3086
3087                 if (kvm_lapic_hv_timer_in_use(vcpu))
3088                         kvm_lapic_restart_hv_timer(vcpu);
3089
3090                 /*
3091                  * On a host with synchronized TSC, there is no need to update
3092                  * kvmclock on vcpu->cpu migration
3093                  */
3094                 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
3095                         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
3096                 if (vcpu->cpu != cpu)
3097                         kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
3098                 vcpu->cpu = cpu;
3099         }
3100
3101         kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3102 }
3103
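/*
 * Publish KVM_VCPU_PREEMPTED in the guest's steal-time area (only if
 * the guest set the enable bit in MSR_KVM_STEAL_TIME), so the guest can
 * tell that this vCPU is currently not running.
 */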
3104 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3105 {
3106         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3107                 return;
3108
3109         vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
3110
3111         kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
3112                         &vcpu->arch.st.steal.preempted,
3113                         offsetof(struct kvm_steal_time, preempted),
3114                         sizeof(vcpu->arch.st.steal.preempted));
3115 }
3116
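/*
 * Called when the vCPU is scheduled out: note whether it was preempted
 * while running in guest kernel mode, publish the preempted flag, let
 * the vendor module save its state, and record the host TSC so that
 * kvm_arch_vcpu_load() can later detect a backwards or unstable TSC.
 */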
3117 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3118 {
3119         int idx;
3120
3121         if (vcpu->preempted)
3122                 vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
3123
3124         /*
3125          * Disable page faults because we're in atomic context here.
3126          * kvm_write_guest_offset_cached() would call might_fault(),
3127          * which relies on pagefault_disable() to tell if there's a
3128          * bug. NOTE: the write to guest memory may not go through
3129          * during postcopy live migration or when there is heavy
3130          * guest paging.
3131          */
3132         pagefault_disable();
3133         /*
3134          * kvm_memslots() will be called by
3135          * kvm_write_guest_offset_cached() so take the srcu lock.
3136          */
3137         idx = srcu_read_lock(&vcpu->kvm->srcu);
3138         kvm_steal_time_set_preempted(vcpu);
3139         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3140         pagefault_enable();
3141         kvm_x86_ops->vcpu_put(vcpu);
3142         vcpu->arch.last_host_tsc = rdtsc();
3143         /*
3144          * If userspace has set any breakpoints or watchpoints, dr6 is restored
3145          * on every vmexit, but if not, we might have a stale dr6 from the
3146          * guest. do_debug expects dr6 to be cleared after it runs; do the same.
3147          */
3148         set_debugreg(0, 6);
3149 }
3150
3151 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
3152                                     struct kvm_lapic_state *s)
3153 {
3154         if (vcpu->arch.apicv_active)
3155                 kvm_x86_ops->sync_pir_to_irr(vcpu);
3156
3157         return kvm_apic_get_state(vcpu, s);
3158 }
3159
3160 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
3161                                     struct kvm_lapic_state *s)
3162 {
3163         int r;
3164
3165         r = kvm_apic_set_state(vcpu, s);
3166         if (r)
3167                 return r;
3168         update_cr8_intercept(vcpu);
3169
3170         return 0;
3171 }
3172
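/*
 * Whether an interrupt injected by the userspace device model can be
 * accepted by this vCPU: always if the local APIC is emulated in
 * userspace, otherwise only if the in-kernel LAPIC would accept a PIC
 * (ExtINT) interrupt.
 */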
3173 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
3174 {
3175         return (!lapic_in_kernel(vcpu) ||
3176                 kvm_apic_accept_pic_intr(vcpu));
3177 }
3178
3179 /*
3180  * if userspace requested an interrupt window, check that the
3181  * interrupt window is open.
3182  *
3183  * No need to exit to userspace if we already have an interrupt queued.
3184  */
3185 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
3186 {
3187         return kvm_arch_interrupt_allowed(vcpu) &&
3188                 !kvm_cpu_has_interrupt(vcpu) &&
3189                 !kvm_event_needs_reinjection(vcpu) &&
3190                 kvm_cpu_accept_dm_intr(vcpu);
3191 }
3192
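/*
 * KVM_INTERRUPT: with a fully userspace irqchip the vector is queued on
 * the vCPU directly; with a split irqchip it is recorded as the pending
 * ExtINT vector; with a full in-kernel irqchip (including the PIC) the
 * ioctl is rejected.
 */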
3193 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
3194                                     struct kvm_interrupt *irq)
3195 {
3196         if (irq->irq >= KVM_NR_INTERRUPTS)
3197                 return -EINVAL;
3198
3199         if (!irqchip_in_kernel(vcpu->kvm)) {
3200                 kvm_queue_interrupt(vcpu, irq->irq, false);
3201                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3202                 return 0;
3203         }
3204
3205         /*
3206          * With in-kernel LAPIC, we only use this to inject EXTINT, so
3207          * fail for in-kernel 8259.
3208          */
3209         if (pic_in_kernel(vcpu->kvm))
3210                 return -ENXIO;
3211
3212         if (vcpu->arch.pending_external_vector != -1)
3213                 return -EEXIST;
3214
3215         vcpu->arch.pending_external_vector = irq->irq;
3216         kvm_make_request(KVM_REQ_EVENT, vcpu);
3217         return 0;
3218 }
3219
3220 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3221 {
3222         kvm_inject_nmi(vcpu);
3223
3224         return 0;
3225 }
3226
3227 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
3228 {
3229         kvm_make_request(KVM_REQ_SMI, vcpu);
3230
3231         return 0;
3232 }
3233
3234 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3235                                            struct kvm_tpr_access_ctl *tac)
3236 {
3237         if (tac->flags)
3238                 return -EINVAL;
3239         vcpu->arch.tpr_access_reporting = !!tac->enabled;
3240         return 0;
3241 }
3242
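/*
 * KVM_X86_SETUP_MCE: validate the requested bank count and capability
 * bits against what KVM supports, then initialize MCG_CTL (if present)
 * and all MCi_CTL registers to all 1s and let the vendor module do its
 * own setup.
 */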
3243 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
3244                                         u64 mcg_cap)
3245 {
3246         int r;
3247         unsigned bank_num = mcg_cap & 0xff, bank;
3248
3249         r = -EINVAL;
3250         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
3251                 goto out;
3252         if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
3253                 goto out;
3254         r = 0;
3255         vcpu->arch.mcg_cap = mcg_cap;
3256         /* Init IA32_MCG_CTL to all 1s */
3257         if (mcg_cap & MCG_CTL_P)
3258                 vcpu->arch.mcg_ctl = ~(u64)0;
3259         /* Init IA32_MCi_CTL to all 1s */
3260         for (bank = 0; bank < bank_num; bank++)
3261                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
3262
3263         if (kvm_x86_ops->setup_mce)
3264                 kvm_x86_ops->setup_mce(vcpu);
3265 out:
3266         return r;
3267 }
3268
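/*
 * KVM_X86_SET_MCE: inject a machine-check event described by userspace.
 * Uncorrected errors raise #MC in the guest, or force a triple fault if
 * machine checks are disabled (CR4.MCE clear) or MCIP is already set;
 * corrected errors are only recorded in the bank registers.
 */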
3269 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
3270                                       struct kvm_x86_mce *mce)
3271 {
3272         u64 mcg_cap = vcpu->arch.mcg_cap;
3273         unsigned bank_num = mcg_cap & 0xff;
3274         u64 *banks = vcpu->arch.mce_banks;
3275
3276         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3277                 return -EINVAL;
3278         /*
3279          * If IA32_MCG_CTL is not all 1s, uncorrected error
3280          * reporting is disabled.
3281          */
3282         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3283             vcpu->arch.mcg_ctl != ~(u64)0)
3284                 return 0;
3285         banks += 4 * mce->bank;
3286         /*
3287          * If IA32_MCi_CTL is not all 1s, uncorrected error
3288          * reporting is disabled for the bank.
3289          */
3290         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3291                 return 0;
3292         if (mce->status & MCI_STATUS_UC) {
3293                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
3294                     !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
3295                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3296                         return 0;
3297                 }
3298                 if (banks[1] & MCI_STATUS_VAL)
3299                         mce->status |= MCI_STATUS_OVER;
3300                 banks[2] = mce->addr;
3301                 banks[3] = mce->misc;
3302                 vcpu->arch.mcg_status = mce->mcg_status;
3303                 banks[1] = mce->status;
3304                 kvm_queue_exception(vcpu, MC_VECTOR);
3305         } else if (!(banks[1] & MCI_STATUS_VAL)
3306                    || !(banks[1] & MCI_STATUS_UC)) {
3307                 if (banks[1] & MCI_STATUS_VAL)
3308                         mce->status |= MCI_STATUS_OVER;
3309                 banks[2] = mce->addr;
3310                 banks[3] = mce->misc;
3311                 banks[1] = mce->status;
3312         } else
3313                 banks[1] |= MCI_STATUS_OVER;
3314         return 0;
3315 }
3316
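/*
 * KVM_GET_VCPU_EVENTS: snapshot the pending exception, interrupt, NMI
 * and SMI state so userspace can save it (e.g. for live migration) and
 * restore it later with KVM_SET_VCPU_EVENTS.
 */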
3317 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3318                                                struct kvm_vcpu_events *events)
3319 {
3320         process_nmi(vcpu);
3321         /*
3322          * FIXME: pass injected and pending separately.  This is only
3323          * needed for nested virtualization, whose state cannot be
3324          * migrated yet.  For now we can combine them.
3325          */
3326         events->exception.injected =
3327                 (vcpu->arch.exception.pending ||
3328                  vcpu->arch.exception.injected) &&
3329                 !kvm_exception_is_soft(vcpu->arch.exception.nr);
3330         events->exception.nr = vcpu->arch.exception.nr;
3331         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3332         events->exception.pad = 0;
3333         events->exception.error_code = vcpu->arch.exception.error_code;
3334
3335         events->interrupt.injected =
3336                 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3337         events->interrupt.nr = vcpu->arch.interrupt.nr;
3338         events->interrupt.soft = 0;
3339         events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
3340
3341         events->nmi.injected = vcpu->arch.nmi_injected;
3342         events->nmi.pending = vcpu->arch.nmi_pending != 0;
3343         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
3344         events->nmi.pad = 0;
3345
3346         events->sipi_vector = 0; /* never valid when reporting to user space */
3347
3348         events->smi.smm = is_smm(vcpu);
3349         events->smi.pending = vcpu->arch.smi_pending;
3350         events->smi.smm_inside_nmi =
3351                 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3352         events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3353
3354         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3355                          | KVM_VCPUEVENT_VALID_SHADOW
3356                          | KVM_VCPUEVENT_VALID_SMM);
3357         memset(&events->reserved, 0, sizeof(events->reserved));
3358 }
3359
3360 static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
3361
3362 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3363                                               struct kvm_vcpu_events *events)
3364 {
3365         if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3366                               | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3367                               | KVM_VCPUEVENT_VALID_SHADOW
3368                               | KVM_VCPUEVENT_VALID_SMM))
3369                 return -EINVAL;
3370
3371         if (events->exception.injected &&
3372             (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3373              is_guest_mode(vcpu)))
3374                 return -EINVAL;
3375
3376         /* INITs are latched while in SMM */
3377         if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
3378             (events->smi.smm || events->smi.pending) &&
3379             vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3380                 return -EINVAL;
3381
3382         process_nmi(vcpu);
3383         vcpu->arch.exception.injected = false;
3384         vcpu->arch.exception.pending = events->exception.injected;
3385         vcpu->arch.exception.nr = events->exception.nr;
3386         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3387         vcpu->arch.exception.error_code = events->exception.error_code;
3388
3389         vcpu->arch.interrupt.injected = events->interrupt.injected;
3390         vcpu->arch.interrupt.nr = events->interrupt.nr;
3391         vcpu->arch.interrupt.soft = events->interrupt.soft;
3392         if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3393                 kvm_x86_ops->set_interrupt_shadow(vcpu,
3394                                                   events->interrupt.shadow);
3395
3396         vcpu->arch.nmi_injected = events->nmi.injected;
3397         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3398                 vcpu->arch.nmi_pending = events->nmi.pending;
3399         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3400
3401         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3402             lapic_in_kernel(vcpu))
3403                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3404
3405         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3406                 u32 hflags = vcpu->arch.hflags;
3407                 if (events->smi.smm)
3408                         hflags |= HF_SMM_MASK;
3409                 else
3410                         hflags &= ~HF_SMM_MASK;
3411                 kvm_set_hflags(vcpu, hflags);
3412
3413                 vcpu->arch.smi_pending = events->smi.pending;
3414
3415                 if (events->smi.smm) {
3416                         if (events->smi.smm_inside_nmi)
3417                                 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
3418                         else
3419                                 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3420                         if (lapic_in_kernel(vcpu)) {
3421                                 if (events->smi.latched_init)
3422                                         set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3423                                 else
3424                                         clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3425                         }
3426                 }
3427         }
3428
3429         kvm_make_request(KVM_REQ_EVENT, vcpu);
3430
3431         return 0;
3432 }
3433
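/*
 * KVM_GET_DEBUGREGS / KVM_SET_DEBUGREGS: expose the guest's DR0-DR3,
 * DR6 and DR7 to userspace.  The setter rejects values with reserved
 * upper bits and refreshes KVM's cached view of the debug registers.
 */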
3434 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3435                                              struct kvm_debugregs *dbgregs)
3436 {
3437         unsigned long val;
3438
3439         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3440         kvm_get_dr(vcpu, 6, &val);
3441         dbgregs->dr6 = val;
3442         dbgregs->dr7 = vcpu->arch.dr7;
3443         dbgregs->flags = 0;
3444         memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3445 }
3446
3447 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3448                                             struct kvm_debugregs *dbgregs)
3449 {
3450         if (dbgregs->flags)
3451                 return -EINVAL;
3452
3453         if (dbgregs->dr6 & ~0xffffffffull)
3454                 return -EINVAL;
3455         if (dbgregs->dr7 & ~0xffffffffull)
3456                 return -EINVAL;
3457
3458         memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3459         kvm_update_dr0123(vcpu);
3460         vcpu->arch.dr6 = dbgregs->dr6;
3461         kvm_update_dr6(vcpu);
3462         vcpu->arch.dr7 = dbgregs->dr7;
3463         kvm_update_dr7(vcpu);
3464
3465         return 0;
3466 }
3467
3468 #define XSTATE_COMPACTION_ENABLED (1ULL << 63)
3469
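/*
 * Convert the guest FPU state from the kernel's (possibly compacted)
 * XSAVE layout to the non-compacted layout that the KVM_GET_XSAVE ABI
 * exposes to userspace, copying one xfeature at a time.
 */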
3470 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
3471 {
3472         struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
3473         u64 xstate_bv = xsave->header.xfeatures;
3474         u64 valid;
3475
3476         /*
3477          * Copy legacy XSAVE area, to avoid complications with CPUID
3478          * leaves 0 and 1 in the loop below.
3479          */
3480         memcpy(dest, xsave, XSAVE_HDR_OFFSET);
3481
3482         /* Set XSTATE_BV */
3483         xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
3484         *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
3485
3486         /*
3487          * Copy each region from the possibly compacted offset to the
3488          * non-compacted offset.
3489          */
3490         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
3491         while (valid) {
3492                 u64 feature = valid & -valid;
3493                 int index = fls64(feature) - 1;
3494                 void *src = get_xsave_addr(xsave, feature);
3495
3496                 if (src) {
3497                         u32 size, offset, ecx, edx;
3498                         cpuid_count(XSTATE_CPUID, index,
3499                                     &size, &offset, &ecx, &edx);
3500                         if (feature == XFEATURE_MASK_PKRU)
3501                                 memcpy(dest + offset, &vcpu->arch