KVM: x86: fix #UD address of failed Hyper-V hypercalls
[muen/linux.git] / arch/x86/kvm/x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Amit Shah    <amit.shah@qumranet.com>
15  *   Ben-Ami Yassour <benami@il.ibm.com>
16  *
17  * This work is licensed under the terms of the GNU GPL, version 2.  See
18  * the COPYING file in the top-level directory.
19  *
20  */
21
22 #include <linux/kvm_host.h>
23 #include "irq.h"
24 #include "mmu.h"
25 #include "i8254.h"
26 #include "tss.h"
27 #include "kvm_cache_regs.h"
28 #include "x86.h"
29 #include "cpuid.h"
30 #include "pmu.h"
31 #include "hyperv.h"
32
33 #include <linux/clocksource.h>
34 #include <linux/interrupt.h>
35 #include <linux/kvm.h>
36 #include <linux/fs.h>
37 #include <linux/vmalloc.h>
38 #include <linux/export.h>
39 #include <linux/moduleparam.h>
40 #include <linux/mman.h>
41 #include <linux/highmem.h>
42 #include <linux/iommu.h>
43 #include <linux/intel-iommu.h>
44 #include <linux/cpufreq.h>
45 #include <linux/user-return-notifier.h>
46 #include <linux/srcu.h>
47 #include <linux/slab.h>
48 #include <linux/perf_event.h>
49 #include <linux/uaccess.h>
50 #include <linux/hash.h>
51 #include <linux/pci.h>
52 #include <linux/timekeeper_internal.h>
53 #include <linux/pvclock_gtod.h>
54 #include <linux/kvm_irqfd.h>
55 #include <linux/irqbypass.h>
56 #include <linux/sched/stat.h>
57 #include <linux/mem_encrypt.h>
58
59 #include <trace/events/kvm.h>
60
61 #include <asm/debugreg.h>
62 #include <asm/msr.h>
63 #include <asm/desc.h>
64 #include <asm/mce.h>
65 #include <linux/kernel_stat.h>
66 #include <asm/fpu/internal.h> /* Ugh! */
67 #include <asm/pvclock.h>
68 #include <asm/div64.h>
69 #include <asm/irq_remapping.h>
70 #include <asm/mshyperv.h>
71 #include <asm/hypervisor.h>
72
73 #define CREATE_TRACE_POINTS
74 #include "trace.h"
75
76 #define MAX_IO_MSRS 256
77 #define KVM_MAX_MCE_BANKS 32
78 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
79 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
80
81 #define emul_to_vcpu(ctxt) \
82         container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
83
84 /* EFER defaults:
85  * - enable syscall by default because it is emulated by KVM
86  * - enable LME and LMA by default on 64-bit KVM
87  */
88 #ifdef CONFIG_X86_64
89 static
90 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
91 #else
92 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
93 #endif
94
95 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
96 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
97
98 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
99                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
100
101 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
102 static void process_nmi(struct kvm_vcpu *vcpu);
103 static void enter_smm(struct kvm_vcpu *vcpu);
104 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
105 static void store_regs(struct kvm_vcpu *vcpu);
106 static int sync_regs(struct kvm_vcpu *vcpu);
107
108 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
109 EXPORT_SYMBOL_GPL(kvm_x86_ops);
110
111 static bool __read_mostly ignore_msrs = 0;
112 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
113
114 static bool __read_mostly report_ignored_msrs = true;
115 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
116
117 unsigned int min_timer_period_us = 200;
118 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
119
120 static bool __read_mostly kvmclock_periodic_sync = true;
121 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
122
123 bool __read_mostly kvm_has_tsc_control;
124 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
125 u32  __read_mostly kvm_max_guest_tsc_khz;
126 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
127 u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
128 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
129 u64  __read_mostly kvm_max_tsc_scaling_ratio;
130 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
131 u64 __read_mostly kvm_default_tsc_scaling_ratio;
132 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
133
134 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
135 static u32 __read_mostly tsc_tolerance_ppm = 250;
136 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
137
138 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
139 unsigned int __read_mostly lapic_timer_advance_ns = 0;
140 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
141
142 static bool __read_mostly vector_hashing = true;
143 module_param(vector_hashing, bool, S_IRUGO);
144
145 bool __read_mostly enable_vmware_backdoor = false;
146 module_param(enable_vmware_backdoor, bool, S_IRUGO);
147 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
148
149 static bool __read_mostly force_emulation_prefix = false;
150 module_param(force_emulation_prefix, bool, S_IRUGO);
151
152 #define KVM_NR_SHARED_MSRS 16
153
154 struct kvm_shared_msrs_global {
155         int nr;
156         u32 msrs[KVM_NR_SHARED_MSRS];
157 };
158
159 struct kvm_shared_msrs {
160         struct user_return_notifier urn;
161         bool registered;
162         struct kvm_shared_msr_values {
163                 u64 host;
164                 u64 curr;
165         } values[KVM_NR_SHARED_MSRS];
166 };
167
168 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
169 static struct kvm_shared_msrs __percpu *shared_msrs;
170
171 struct kvm_stats_debugfs_item debugfs_entries[] = {
172         { "pf_fixed", VCPU_STAT(pf_fixed) },
173         { "pf_guest", VCPU_STAT(pf_guest) },
174         { "tlb_flush", VCPU_STAT(tlb_flush) },
175         { "invlpg", VCPU_STAT(invlpg) },
176         { "exits", VCPU_STAT(exits) },
177         { "io_exits", VCPU_STAT(io_exits) },
178         { "mmio_exits", VCPU_STAT(mmio_exits) },
179         { "signal_exits", VCPU_STAT(signal_exits) },
180         { "irq_window", VCPU_STAT(irq_window_exits) },
181         { "nmi_window", VCPU_STAT(nmi_window_exits) },
182         { "halt_exits", VCPU_STAT(halt_exits) },
183         { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
184         { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
185         { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
186         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
187         { "hypercalls", VCPU_STAT(hypercalls) },
188         { "request_irq", VCPU_STAT(request_irq_exits) },
189         { "irq_exits", VCPU_STAT(irq_exits) },
190         { "host_state_reload", VCPU_STAT(host_state_reload) },
191         { "fpu_reload", VCPU_STAT(fpu_reload) },
192         { "insn_emulation", VCPU_STAT(insn_emulation) },
193         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
194         { "irq_injections", VCPU_STAT(irq_injections) },
195         { "nmi_injections", VCPU_STAT(nmi_injections) },
196         { "req_event", VCPU_STAT(req_event) },
197         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
198         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
199         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
200         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
201         { "mmu_flooded", VM_STAT(mmu_flooded) },
202         { "mmu_recycled", VM_STAT(mmu_recycled) },
203         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
204         { "mmu_unsync", VM_STAT(mmu_unsync) },
205         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
206         { "largepages", VM_STAT(lpages) },
207         { "max_mmu_page_hash_collisions",
208                 VM_STAT(max_mmu_page_hash_collisions) },
209         { NULL }
210 };
211
212 u64 __read_mostly host_xcr0;
213
214 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
215
216 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
217 {
218         int i;
219         for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
220                 vcpu->arch.apf.gfns[i] = ~0;
221 }
222
223 static void kvm_on_user_return(struct user_return_notifier *urn)
224 {
225         unsigned slot;
226         struct kvm_shared_msrs *locals
227                 = container_of(urn, struct kvm_shared_msrs, urn);
228         struct kvm_shared_msr_values *values;
229         unsigned long flags;
230
231         /*
232          * Disabling irqs at this point since the following code could be
233          * interrupted and executed through kvm_arch_hardware_disable()
234          */
235         local_irq_save(flags);
236         if (locals->registered) {
237                 locals->registered = false;
238                 user_return_notifier_unregister(urn);
239         }
240         local_irq_restore(flags);
241         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
242                 values = &locals->values[slot];
243                 if (values->host != values->curr) {
244                         wrmsrl(shared_msrs_global.msrs[slot], values->host);
245                         values->curr = values->host;
246                 }
247         }
248 }
249
250 static void shared_msr_update(unsigned slot, u32 msr)
251 {
252         u64 value;
253         unsigned int cpu = smp_processor_id();
254         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
255
256         /* shared_msrs_global is only read here, and nobody should be
257          * modifying it at this time, so no lock is needed */
258         if (slot >= shared_msrs_global.nr) {
259                 printk(KERN_ERR "kvm: invalid MSR slot!");
260                 return;
261         }
262         rdmsrl_safe(msr, &value);
263         smsr->values[slot].host = value;
264         smsr->values[slot].curr = value;
265 }
266
267 void kvm_define_shared_msr(unsigned slot, u32 msr)
268 {
269         BUG_ON(slot >= KVM_NR_SHARED_MSRS);
270         shared_msrs_global.msrs[slot] = msr;
271         if (slot >= shared_msrs_global.nr)
272                 shared_msrs_global.nr = slot + 1;
273 }
274 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
275
276 static void kvm_shared_msr_cpu_online(void)
277 {
278         unsigned i;
279
280         for (i = 0; i < shared_msrs_global.nr; ++i)
281                 shared_msr_update(i, shared_msrs_global.msrs[i]);
282 }
283
284 int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
285 {
286         unsigned int cpu = smp_processor_id();
287         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
288         int err;
289
290         if (((value ^ smsr->values[slot].curr) & mask) == 0)
291                 return 0;
292         smsr->values[slot].curr = value;
293         err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
294         if (err)
295                 return 1;
296
297         if (!smsr->registered) {
298                 smsr->urn.on_user_return = kvm_on_user_return;
299                 user_return_notifier_register(&smsr->urn);
300                 smsr->registered = true;
301         }
302         return 0;
303 }
304 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
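
/*
 * Illustrative usage sketch (editor's note, not part of the original file);
 * the slot number and MSR below are hypothetical.  A vendor module wires the
 * shared-MSR machinery up roughly as follows:
 *
 *	kvm_define_shared_msr(0, MSR_STAR);		// at hardware setup
 *	...
 *	kvm_set_shared_msr(0, guest_star, -1ull);	// on vcpu load
 *
 * A mask of -1ull compares all bits against the cached value.  The host
 * value is restored automatically by kvm_on_user_return() when the CPU
 * returns to userspace, via the user-return notifier registered above.
 */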
305
306 static void drop_user_return_notifiers(void)
307 {
308         unsigned int cpu = smp_processor_id();
309         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
310
311         if (smsr->registered)
312                 kvm_on_user_return(&smsr->urn);
313 }
314
315 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
316 {
317         return vcpu->arch.apic_base;
318 }
319 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
320
321 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
322 {
323         u64 old_state = vcpu->arch.apic_base &
324                 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
325         u64 new_state = msr_info->data &
326                 (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
327         u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
328                 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
329
330         if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE)
331                 return 1;
332         if (!msr_info->host_initiated &&
333             ((new_state == MSR_IA32_APICBASE_ENABLE &&
334               old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
335              (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
336               old_state == 0)))
337                 return 1;
338
339         kvm_lapic_set_base(vcpu, msr_info->data);
340         return 0;
341 }
342 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
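
/*
 * Worked example (editor's illustrative note, not part of the original
 * source): for a hypothetical guest with cpuid_maxphyaddr() == 36 and no
 * X2APIC CPUID bit, reserved_bits evaluates to
 * (~0ULL << 36) | 0x2ff | X2APIC_ENABLE = 0xfffffff0000006ff, so any write
 * setting one of those bits, or setting X2APIC_ENABLE without
 * MSR_IA32_APICBASE_ENABLE, is rejected.
 */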
343
344 asmlinkage __visible void kvm_spurious_fault(void)
345 {
346         /* Fault while not rebooting.  We want the trace. */
347         BUG();
348 }
349 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
350
351 #define EXCPT_BENIGN            0
352 #define EXCPT_CONTRIBUTORY      1
353 #define EXCPT_PF                2
354
355 static int exception_class(int vector)
356 {
357         switch (vector) {
358         case PF_VECTOR:
359                 return EXCPT_PF;
360         case DE_VECTOR:
361         case TS_VECTOR:
362         case NP_VECTOR:
363         case SS_VECTOR:
364         case GP_VECTOR:
365                 return EXCPT_CONTRIBUTORY;
366         default:
367                 break;
368         }
369         return EXCPT_BENIGN;
370 }
371
372 #define EXCPT_FAULT             0
373 #define EXCPT_TRAP              1
374 #define EXCPT_ABORT             2
375 #define EXCPT_INTERRUPT         3
376
377 static int exception_type(int vector)
378 {
379         unsigned int mask;
380
381         if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
382                 return EXCPT_INTERRUPT;
383
384         mask = 1 << vector;
385
386         /* #DB is a trap, as instruction watchpoints are handled elsewhere */
387         if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
388                 return EXCPT_TRAP;
389
390         if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
391                 return EXCPT_ABORT;
392
393         /* Reserved exceptions will result in fault */
394         return EXCPT_FAULT;
395 }
396
397 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
398                 unsigned nr, bool has_error, u32 error_code,
399                 bool reinject)
400 {
401         u32 prev_nr;
402         int class1, class2;
403
404         kvm_make_request(KVM_REQ_EVENT, vcpu);
405
406         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
407         queue:
408                 if (has_error && !is_protmode(vcpu))
409                         has_error = false;
410                 if (reinject) {
411                         /*
412                          * On vmentry, vcpu->arch.exception.pending is only
413                          * true if an event injection was blocked by
414                          * nested_run_pending.  In that case, however,
415                          * vcpu_enter_guest requests an immediate exit,
416                          * and the guest shouldn't proceed far enough to
417                          * need reinjection.
418                          */
419                         WARN_ON_ONCE(vcpu->arch.exception.pending);
420                         vcpu->arch.exception.injected = true;
421                 } else {
422                         vcpu->arch.exception.pending = true;
423                         vcpu->arch.exception.injected = false;
424                 }
425                 vcpu->arch.exception.has_error_code = has_error;
426                 vcpu->arch.exception.nr = nr;
427                 vcpu->arch.exception.error_code = error_code;
428                 return;
429         }
430
431         /* An exception is already queued; check how the new one combines with it. */
432         prev_nr = vcpu->arch.exception.nr;
433         if (prev_nr == DF_VECTOR) {
434                 /* triple fault -> shutdown */
435                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
436                 return;
437         }
438         class1 = exception_class(prev_nr);
439         class2 = exception_class(nr);
440         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
441                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
442                 /*
443                  * Generate double fault per SDM Table 5-5.  Set
444                  * exception.pending = true so that the double fault
445                  * can trigger a nested vmexit.
446                  */
447                 vcpu->arch.exception.pending = true;
448                 vcpu->arch.exception.injected = false;
449                 vcpu->arch.exception.has_error_code = true;
450                 vcpu->arch.exception.nr = DF_VECTOR;
451                 vcpu->arch.exception.error_code = 0;
452         } else
453                 /* replace previous exception with a new one in the hope
454                    that instruction re-execution will regenerate the lost
455                    exception */
456                 goto queue;
457 }
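
/*
 * Worked example (editor's illustrative note, not part of the original
 * source), following SDM Table 5-5: if a #PF is pending (class EXCPT_PF)
 * and delivering it raises a #GP (class EXCPT_CONTRIBUTORY), the two are
 * merged above into a #DF with error code 0.  A benign exception such as
 * #DB arriving while a #GP is pending instead takes the "queue" path and
 * simply replaces it, relying on instruction re-execution to regenerate
 * the earlier exception.
 */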
458
459 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
460 {
461         kvm_multiple_exception(vcpu, nr, false, 0, false);
462 }
463 EXPORT_SYMBOL_GPL(kvm_queue_exception);
464
465 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
466 {
467         kvm_multiple_exception(vcpu, nr, false, 0, true);
468 }
469 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
470
471 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
472 {
473         if (err)
474                 kvm_inject_gp(vcpu, 0);
475         else
476                 return kvm_skip_emulated_instruction(vcpu);
477
478         return 1;
479 }
480 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
481
482 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
483 {
484         ++vcpu->stat.pf_guest;
485         vcpu->arch.exception.nested_apf =
486                 is_guest_mode(vcpu) && fault->async_page_fault;
487         if (vcpu->arch.exception.nested_apf)
488                 vcpu->arch.apf.nested_apf_token = fault->address;
489         else
490                 vcpu->arch.cr2 = fault->address;
491         kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
492 }
493 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
494
495 static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
496 {
497         if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
498                 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
499         else
500                 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
501
502         return fault->nested_page_fault;
503 }
504
505 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
506 {
507         atomic_inc(&vcpu->arch.nmi_queued);
508         kvm_make_request(KVM_REQ_NMI, vcpu);
509 }
510 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
511
512 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
513 {
514         kvm_multiple_exception(vcpu, nr, true, error_code, false);
515 }
516 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
517
518 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
519 {
520         kvm_multiple_exception(vcpu, nr, true, error_code, true);
521 }
522 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
523
524 /*
525  * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
526  * a #GP and return false.
527  */
528 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
529 {
530         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
531                 return true;
532         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
533         return false;
534 }
535 EXPORT_SYMBOL_GPL(kvm_require_cpl);
536
537 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
538 {
539         if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
540                 return true;
541
542         kvm_queue_exception(vcpu, UD_VECTOR);
543         return false;
544 }
545 EXPORT_SYMBOL_GPL(kvm_require_dr);
546
547 /*
548  * This function is used to read from the physical memory of the currently
549  * running guest.  Unlike kvm_vcpu_read_guest_page, it can read either from
550  * guest physical memory or from the guest's nested guest physical memory.
551  */
552 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
553                             gfn_t ngfn, void *data, int offset, int len,
554                             u32 access)
555 {
556         struct x86_exception exception;
557         gfn_t real_gfn;
558         gpa_t ngpa;
559
560         ngpa     = gfn_to_gpa(ngfn);
561         real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
562         if (real_gfn == UNMAPPED_GVA)
563                 return -EFAULT;
564
565         real_gfn = gpa_to_gfn(real_gfn);
566
567         return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
568 }
569 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
570
571 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
572                                void *data, int offset, int len, u32 access)
573 {
574         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
575                                        data, offset, len, access);
576 }
577
578 /*
579  * Load the PAE PDPTRs.  Return true if they are all valid.
580  */
581 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
582 {
583         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
584         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
585         int i;
586         int ret;
587         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
588
589         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
590                                       offset * sizeof(u64), sizeof(pdpte),
591                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
592         if (ret < 0) {
593                 ret = 0;
594                 goto out;
595         }
596         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
597                 if ((pdpte[i] & PT_PRESENT_MASK) &&
598                     (pdpte[i] &
599                      vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
600                         ret = 0;
601                         goto out;
602                 }
603         }
604         ret = 1;
605
606         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
607         __set_bit(VCPU_EXREG_PDPTR,
608                   (unsigned long *)&vcpu->arch.regs_avail);
609         __set_bit(VCPU_EXREG_PDPTR,
610                   (unsigned long *)&vcpu->arch.regs_dirty);
611 out:
612
613         return ret;
614 }
615 EXPORT_SYMBOL_GPL(load_pdptrs);
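
/*
 * Worked example (editor's illustrative note, not part of the original
 * source): for a hypothetical PAE cr3 of 0x12345be0, pdpt_gfn is 0x12345
 * and offset = (0xbe0 >> 5) << 2 = 0x17c, an index in u64 units.  The read
 * in load_pdptrs() therefore starts at byte offset 0x17c * sizeof(u64) =
 * 0xbe0, i.e. the 32-byte-aligned PDPT selected by cr3[11:5], and fetches
 * the four 8-byte PDPTEs.
 */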
616
617 bool pdptrs_changed(struct kvm_vcpu *vcpu)
618 {
619         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
620         bool changed = true;
621         int offset;
622         gfn_t gfn;
623         int r;
624
625         if (is_long_mode(vcpu) || !is_pae(vcpu))
626                 return false;
627
628         if (!test_bit(VCPU_EXREG_PDPTR,
629                       (unsigned long *)&vcpu->arch.regs_avail))
630                 return true;
631
632         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
633         offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
634         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
635                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
636         if (r < 0)
637                 goto out;
638         changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
639 out:
640
641         return changed;
642 }
643 EXPORT_SYMBOL_GPL(pdptrs_changed);
644
645 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
646 {
647         unsigned long old_cr0 = kvm_read_cr0(vcpu);
648         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
649
650         cr0 |= X86_CR0_ET;
651
652 #ifdef CONFIG_X86_64
653         if (cr0 & 0xffffffff00000000UL)
654                 return 1;
655 #endif
656
657         cr0 &= ~CR0_RESERVED_BITS;
658
659         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
660                 return 1;
661
662         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
663                 return 1;
664
665         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
666 #ifdef CONFIG_X86_64
667                 if ((vcpu->arch.efer & EFER_LME)) {
668                         int cs_db, cs_l;
669
670                         if (!is_pae(vcpu))
671                                 return 1;
672                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
673                         if (cs_l)
674                                 return 1;
675                 } else
676 #endif
677                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
678                                                  kvm_read_cr3(vcpu)))
679                         return 1;
680         }
681
682         if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
683                 return 1;
684
685         kvm_x86_ops->set_cr0(vcpu, cr0);
686
687         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
688                 kvm_clear_async_pf_completion_queue(vcpu);
689                 kvm_async_pf_hash_reset(vcpu);
690         }
691
692         if ((cr0 ^ old_cr0) & update_bits)
693                 kvm_mmu_reset_context(vcpu);
694
695         if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
696             kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
697             !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
698                 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
699
700         return 0;
701 }
702 EXPORT_SYMBOL_GPL(kvm_set_cr0);
703
704 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
705 {
706         (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
707 }
708 EXPORT_SYMBOL_GPL(kvm_lmsw);
709
710 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
711 {
712         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
713                         !vcpu->guest_xcr0_loaded) {
714                 /* kvm_set_xcr() also depends on this */
715                 if (vcpu->arch.xcr0 != host_xcr0)
716                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
717                 vcpu->guest_xcr0_loaded = 1;
718         }
719 }
720
721 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
722 {
723         if (vcpu->guest_xcr0_loaded) {
724                 if (vcpu->arch.xcr0 != host_xcr0)
725                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
726                 vcpu->guest_xcr0_loaded = 0;
727         }
728 }
729
730 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
731 {
732         u64 xcr0 = xcr;
733         u64 old_xcr0 = vcpu->arch.xcr0;
734         u64 valid_bits;
735
736         /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
737         if (index != XCR_XFEATURE_ENABLED_MASK)
738                 return 1;
739         if (!(xcr0 & XFEATURE_MASK_FP))
740                 return 1;
741         if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
742                 return 1;
743
744         /*
745          * Do not allow the guest to set bits that we do not support
746          * saving.  However, xcr0 bit 0 is always set, even if the
747          * emulated CPU does not support XSAVE (see fx_init).
748          */
749         valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
750         if (xcr0 & ~valid_bits)
751                 return 1;
752
753         if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
754             (!(xcr0 & XFEATURE_MASK_BNDCSR)))
755                 return 1;
756
757         if (xcr0 & XFEATURE_MASK_AVX512) {
758                 if (!(xcr0 & XFEATURE_MASK_YMM))
759                         return 1;
760                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
761                         return 1;
762         }
763         vcpu->arch.xcr0 = xcr0;
764
765         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
766                 kvm_update_cpuid(vcpu);
767         return 0;
768 }
769
770 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
771 {
772         if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
773             __kvm_set_xcr(vcpu, index, xcr)) {
774                 kvm_inject_gp(vcpu, 0);
775                 return 1;
776         }
777         return 0;
778 }
779 EXPORT_SYMBOL_GPL(kvm_set_xcr);
780
781 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
782 {
783         unsigned long old_cr4 = kvm_read_cr4(vcpu);
784         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
785                                    X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
786
787         if (cr4 & CR4_RESERVED_BITS)
788                 return 1;
789
790         if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
791                 return 1;
792
793         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
794                 return 1;
795
796         if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
797                 return 1;
798
799         if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
800                 return 1;
801
802         if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
803                 return 1;
804
805         if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
806                 return 1;
807
808         if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
809                 return 1;
810
811         if (is_long_mode(vcpu)) {
812                 if (!(cr4 & X86_CR4_PAE))
813                         return 1;
814         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
815                    && ((cr4 ^ old_cr4) & pdptr_bits)
816                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
817                                    kvm_read_cr3(vcpu)))
818                 return 1;
819
820         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
821                 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
822                         return 1;
823
824                 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
825                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
826                         return 1;
827         }
828
829         if (kvm_x86_ops->set_cr4(vcpu, cr4))
830                 return 1;
831
832         if (((cr4 ^ old_cr4) & pdptr_bits) ||
833             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
834                 kvm_mmu_reset_context(vcpu);
835
836         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
837                 kvm_update_cpuid(vcpu);
838
839         return 0;
840 }
841 EXPORT_SYMBOL_GPL(kvm_set_cr4);
842
843 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
844 {
845 #ifdef CONFIG_X86_64
846         bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
847
848         if (pcid_enabled)
849                 cr3 &= ~CR3_PCID_INVD;
850 #endif
851
852         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
853                 kvm_mmu_sync_roots(vcpu);
854                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
855                 return 0;
856         }
857
858         if (is_long_mode(vcpu) &&
859             (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
860                 return 1;
861         else if (is_pae(vcpu) && is_paging(vcpu) &&
862                    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
863                 return 1;
864
865         vcpu->arch.cr3 = cr3;
866         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
867         kvm_mmu_new_cr3(vcpu);
868         return 0;
869 }
870 EXPORT_SYMBOL_GPL(kvm_set_cr3);
871
872 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
873 {
874         if (cr8 & CR8_RESERVED_BITS)
875                 return 1;
876         if (lapic_in_kernel(vcpu))
877                 kvm_lapic_set_tpr(vcpu, cr8);
878         else
879                 vcpu->arch.cr8 = cr8;
880         return 0;
881 }
882 EXPORT_SYMBOL_GPL(kvm_set_cr8);
883
884 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
885 {
886         if (lapic_in_kernel(vcpu))
887                 return kvm_lapic_get_cr8(vcpu);
888         else
889                 return vcpu->arch.cr8;
890 }
891 EXPORT_SYMBOL_GPL(kvm_get_cr8);
892
893 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
894 {
895         int i;
896
897         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
898                 for (i = 0; i < KVM_NR_DB_REGS; i++)
899                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
900                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
901         }
902 }
903
904 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
905 {
906         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
907                 kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
908 }
909
910 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
911 {
912         unsigned long dr7;
913
914         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
915                 dr7 = vcpu->arch.guest_debug_dr7;
916         else
917                 dr7 = vcpu->arch.dr7;
918         kvm_x86_ops->set_dr7(vcpu, dr7);
919         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
920         if (dr7 & DR7_BP_EN_MASK)
921                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
922 }
923
924 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
925 {
926         u64 fixed = DR6_FIXED_1;
927
928         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
929                 fixed |= DR6_RTM;
930         return fixed;
931 }
932
933 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
934 {
935         switch (dr) {
936         case 0 ... 3:
937                 vcpu->arch.db[dr] = val;
938                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
939                         vcpu->arch.eff_db[dr] = val;
940                 break;
941         case 4:
942                 /* fall through */
943         case 6:
944                 if (val & 0xffffffff00000000ULL)
945                         return -1; /* #GP */
946                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
947                 kvm_update_dr6(vcpu);
948                 break;
949         case 5:
950                 /* fall through */
951         default: /* 7 */
952                 if (val & 0xffffffff00000000ULL)
953                         return -1; /* #GP */
954                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
955                 kvm_update_dr7(vcpu);
956                 break;
957         }
958
959         return 0;
960 }
961
962 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
963 {
964         if (__kvm_set_dr(vcpu, dr, val)) {
965                 kvm_inject_gp(vcpu, 0);
966                 return 1;
967         }
968         return 0;
969 }
970 EXPORT_SYMBOL_GPL(kvm_set_dr);
971
972 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
973 {
974         switch (dr) {
975         case 0 ... 3:
976                 *val = vcpu->arch.db[dr];
977                 break;
978         case 4:
979                 /* fall through */
980         case 6:
981                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
982                         *val = vcpu->arch.dr6;
983                 else
984                         *val = kvm_x86_ops->get_dr6(vcpu);
985                 break;
986         case 5:
987                 /* fall through */
988         default: /* 7 */
989                 *val = vcpu->arch.dr7;
990                 break;
991         }
992         return 0;
993 }
994 EXPORT_SYMBOL_GPL(kvm_get_dr);
995
996 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
997 {
998         u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
999         u64 data;
1000         int err;
1001
1002         err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1003         if (err)
1004                 return err;
1005         kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1006         kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1007         return err;
1008 }
1009 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1010
1011 /*
1012  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1013  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1014  *
1015  * This list is modified at module load time to reflect the
1016  * capabilities of the host cpu. This capabilities test skips MSRs that are
1017  * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1018  * may depend on host virtualization features rather than host cpu features.
1019  */
1020
1021 static u32 msrs_to_save[] = {
1022         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1023         MSR_STAR,
1024 #ifdef CONFIG_X86_64
1025         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1026 #endif
1027         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1028         MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1029         MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1030 };
1031
1032 static unsigned num_msrs_to_save;
1033
1034 static u32 emulated_msrs[] = {
1035         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1036         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1037         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1038         HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1039         HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1040         HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1041         HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1042         HV_X64_MSR_RESET,
1043         HV_X64_MSR_VP_INDEX,
1044         HV_X64_MSR_VP_RUNTIME,
1045         HV_X64_MSR_SCONTROL,
1046         HV_X64_MSR_STIMER0_CONFIG,
1047         HV_X64_MSR_VP_ASSIST_PAGE,
1048         HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1049         HV_X64_MSR_TSC_EMULATION_STATUS,
1050
1051         MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1052         MSR_KVM_PV_EOI_EN,
1053
1054         MSR_IA32_TSC_ADJUST,
1055         MSR_IA32_TSCDEADLINE,
1056         MSR_IA32_MISC_ENABLE,
1057         MSR_IA32_MCG_STATUS,
1058         MSR_IA32_MCG_CTL,
1059         MSR_IA32_MCG_EXT_CTL,
1060         MSR_IA32_SMBASE,
1061         MSR_SMI_COUNT,
1062         MSR_PLATFORM_INFO,
1063         MSR_MISC_FEATURES_ENABLES,
1064 };
1065
1066 static unsigned num_emulated_msrs;
1067
1068 /*
1069  * List of msr numbers which are used to expose MSR-based features that
1070  * can be used by a hypervisor to validate requested CPU features.
1071  */
1072 static u32 msr_based_features[] = {
1073         MSR_IA32_VMX_BASIC,
1074         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1075         MSR_IA32_VMX_PINBASED_CTLS,
1076         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1077         MSR_IA32_VMX_PROCBASED_CTLS,
1078         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1079         MSR_IA32_VMX_EXIT_CTLS,
1080         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1081         MSR_IA32_VMX_ENTRY_CTLS,
1082         MSR_IA32_VMX_MISC,
1083         MSR_IA32_VMX_CR0_FIXED0,
1084         MSR_IA32_VMX_CR0_FIXED1,
1085         MSR_IA32_VMX_CR4_FIXED0,
1086         MSR_IA32_VMX_CR4_FIXED1,
1087         MSR_IA32_VMX_VMCS_ENUM,
1088         MSR_IA32_VMX_PROCBASED_CTLS2,
1089         MSR_IA32_VMX_EPT_VPID_CAP,
1090         MSR_IA32_VMX_VMFUNC,
1091
1092         MSR_F10H_DECFG,
1093         MSR_IA32_UCODE_REV,
1094 };
1095
1096 static unsigned int num_msr_based_features;
1097
1098 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1099 {
1100         switch (msr->index) {
1101         case MSR_IA32_UCODE_REV:
1102                 rdmsrl(msr->index, msr->data);
1103                 break;
1104         default:
1105                 if (kvm_x86_ops->get_msr_feature(msr))
1106                         return 1;
1107         }
1108         return 0;
1109 }
1110
1111 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1112 {
1113         struct kvm_msr_entry msr;
1114         int r;
1115
1116         msr.index = index;
1117         r = kvm_get_msr_feature(&msr);
1118         if (r)
1119                 return r;
1120
1121         *data = msr.data;
1122
1123         return 0;
1124 }
1125
1126 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1127 {
1128         if (efer & efer_reserved_bits)
1129                 return false;
1130
1131         if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1132                         return false;
1133
1134         if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1135                         return false;
1136
1137         return true;
1138 }
1139 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1140
1141 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
1142 {
1143         u64 old_efer = vcpu->arch.efer;
1144
1145         if (!kvm_valid_efer(vcpu, efer))
1146                 return 1;
1147
1148         if (is_paging(vcpu)
1149             && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1150                 return 1;
1151
1152         efer &= ~EFER_LMA;
1153         efer |= vcpu->arch.efer & EFER_LMA;
1154
1155         kvm_x86_ops->set_efer(vcpu, efer);
1156
1157         /* Update reserved bits */
1158         if ((efer ^ old_efer) & EFER_NX)
1159                 kvm_mmu_reset_context(vcpu);
1160
1161         return 0;
1162 }
1163
1164 void kvm_enable_efer_bits(u64 mask)
1165 {
1166        efer_reserved_bits &= ~mask;
1167 }
1168 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1169
1170 /*
1171  * Writes msr value into the appropriate "register".
1172  * Returns 0 on success, non-0 otherwise.
1173  * Assumes vcpu_load() was already called.
1174  */
1175 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1176 {
1177         switch (msr->index) {
1178         case MSR_FS_BASE:
1179         case MSR_GS_BASE:
1180         case MSR_KERNEL_GS_BASE:
1181         case MSR_CSTAR:
1182         case MSR_LSTAR:
1183                 if (is_noncanonical_address(msr->data, vcpu))
1184                         return 1;
1185                 break;
1186         case MSR_IA32_SYSENTER_EIP:
1187         case MSR_IA32_SYSENTER_ESP:
1188                 /*
1189                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1190                  * non-canonical address is written on Intel but not on
1191                  * AMD (which ignores the top 32-bits, because it does
1192                  * not implement 64-bit SYSENTER).
1193                  *
1194                  * 64-bit code should hence be able to write a non-canonical
1195                  * value on AMD.  Making the address canonical ensures that
1196                  * vmentry does not fail on Intel after writing a non-canonical
1197                  * value, and that something deterministic happens if the guest
1198                  * invokes 64-bit SYSENTER.
1199                  */
1200                 msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1201         }
1202         return kvm_x86_ops->set_msr(vcpu, msr);
1203 }
1204 EXPORT_SYMBOL_GPL(kvm_set_msr);
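
/*
 * Illustrative example (editor's note, not part of the original source;
 * assumes the usual 48 virtual-address bits and that get_canonical()
 * sign-extends from the highest implemented bit): a guest write of
 * 0x0000800000000000 to MSR_IA32_SYSENTER_EIP is stored as the canonical
 * value 0xffff800000000000, so VM entry cannot fail on Intel and the
 * result is deterministic on AMD.
 */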
1205
1206 /*
1207  * Adapt kvm_get_msr()/kvm_set_msr() to msr_io()'s calling convention
1208  */
1209 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1210 {
1211         struct msr_data msr;
1212         int r;
1213
1214         msr.index = index;
1215         msr.host_initiated = true;
1216         r = kvm_get_msr(vcpu, &msr);
1217         if (r)
1218                 return r;
1219
1220         *data = msr.data;
1221         return 0;
1222 }
1223
1224 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1225 {
1226         struct msr_data msr;
1227
1228         msr.data = *data;
1229         msr.index = index;
1230         msr.host_initiated = true;
1231         return kvm_set_msr(vcpu, &msr);
1232 }
1233
1234 #ifdef CONFIG_X86_64
1235 struct pvclock_gtod_data {
1236         seqcount_t      seq;
1237
1238         struct { /* extract of a clocksource struct */
1239                 int vclock_mode;
1240                 u64     cycle_last;
1241                 u64     mask;
1242                 u32     mult;
1243                 u32     shift;
1244         } clock;
1245
1246         u64             boot_ns;
1247         u64             nsec_base;
1248         u64             wall_time_sec;
1249 };
1250
1251 static struct pvclock_gtod_data pvclock_gtod_data;
1252
1253 static void update_pvclock_gtod(struct timekeeper *tk)
1254 {
1255         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1256         u64 boot_ns;
1257
1258         boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
1259
1260         write_seqcount_begin(&vdata->seq);
1261
1262         /* copy pvclock gtod data */
1263         vdata->clock.vclock_mode        = tk->tkr_mono.clock->archdata.vclock_mode;
1264         vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1265         vdata->clock.mask               = tk->tkr_mono.mask;
1266         vdata->clock.mult               = tk->tkr_mono.mult;
1267         vdata->clock.shift              = tk->tkr_mono.shift;
1268
1269         vdata->boot_ns                  = boot_ns;
1270         vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
1271
1272         vdata->wall_time_sec            = tk->xtime_sec;
1273
1274         write_seqcount_end(&vdata->seq);
1275 }
1276 #endif
1277
1278 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1279 {
1280         /*
1281          * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1282          * vcpu_enter_guest.  This function is only called from
1283          * the physical CPU that is running vcpu.
1284          */
1285         kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1286 }
1287
1288 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1289 {
1290         int version;
1291         int r;
1292         struct pvclock_wall_clock wc;
1293         struct timespec64 boot;
1294
1295         if (!wall_clock)
1296                 return;
1297
1298         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1299         if (r)
1300                 return;
1301
1302         if (version & 1)
1303                 ++version;  /* first time write, random junk */
1304
1305         ++version;
1306
1307         if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1308                 return;
1309
1310         /*
1311          * The guest calculates current wall clock time by adding
1312          * system time (updated by kvm_guest_time_update below) to the
1313          * wall clock specified here.  guest system time equals host
1314          * system time for us, thus we must fill in host boot time here.
1315          */
1316         getboottime64(&boot);
1317
1318         if (kvm->arch.kvmclock_offset) {
1319                 struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1320                 boot = timespec64_sub(boot, ts);
1321         }
1322         wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1323         wc.nsec = boot.tv_nsec;
1324         wc.version = version;
1325
1326         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1327
1328         version++;
1329         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1330 }
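
/*
 * Illustrative note (editor's addition, not part of the original source):
 * the version field acts as a small seqlock.  kvm_write_wall_clock() bumps
 * it to an odd value, writes the payload, then bumps it to an even value;
 * a guest reader is expected to do roughly:
 *
 *	do {
 *		v = wc->version;
 *		rmb();
 *		sec = wc->sec; nsec = wc->nsec;
 *		rmb();
 *	} while ((v & 1) || v != wc->version);
 */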
1331
1332 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1333 {
1334         do_shl32_div32(dividend, divisor);
1335         return dividend;
1336 }
1337
1338 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1339                                s8 *pshift, u32 *pmultiplier)
1340 {
1341         uint64_t scaled64;
1342         int32_t  shift = 0;
1343         uint64_t tps64;
1344         uint32_t tps32;
1345
1346         tps64 = base_hz;
1347         scaled64 = scaled_hz;
1348         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1349                 tps64 >>= 1;
1350                 shift--;
1351         }
1352
1353         tps32 = (uint32_t)tps64;
1354         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1355                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1356                         scaled64 >>= 1;
1357                 else
1358                         tps32 <<= 1;
1359                 shift++;
1360         }
1361
1362         *pshift = shift;
1363         *pmultiplier = div_frac(scaled64, tps32);
1364
1365         pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1366                  __func__, base_hz, scaled_hz, shift, *pmultiplier);
1367 }
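
/*
 * Worked example (editor's illustrative note, not part of the original
 * source): for a hypothetical 2 GHz guest TSC, kvm_set_tsc_khz() calls
 * kvm_get_time_scale(2000000 * 1000, NSEC_PER_SEC, ...).  The loops above
 * yield shift = 2 and mult = 0x80000000, so pvclock_scale_delta() turns
 * nanoseconds into cycles as ((ns << 2) * 0x80000000) >> 32 = ns * 2,
 * i.e. two TSC cycles per nanosecond, as expected for 2 GHz.
 */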
1368
1369 #ifdef CONFIG_X86_64
1370 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1371 #endif
1372
1373 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1374 static unsigned long max_tsc_khz;
1375
1376 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1377 {
1378         u64 v = (u64)khz * (1000000 + ppm);
1379         do_div(v, 1000000);
1380         return v;
1381 }
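
/*
 * Worked example (editor's illustrative note, not part of the original
 * source): with a hypothetical host tsc_khz of 2000000 and the default
 * tsc_tolerance_ppm of 250, kvm_set_tsc_khz() below accepts requested
 * rates between adjust_tsc_khz(2000000, -250) = 1999500 kHz and
 * adjust_tsc_khz(2000000, 250) = 2000500 kHz without enabling scaling
 * or catchup.
 */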
1382
1383 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1384 {
1385         u64 ratio;
1386
1387         /* Guest TSC same frequency as host TSC? */
1388         if (!scale) {
1389                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1390                 return 0;
1391         }
1392
1393         /* TSC scaling supported? */
1394         if (!kvm_has_tsc_control) {
1395                 if (user_tsc_khz > tsc_khz) {
1396                         vcpu->arch.tsc_catchup = 1;
1397                         vcpu->arch.tsc_always_catchup = 1;
1398                         return 0;
1399                 } else {
1400                         WARN(1, "user requested TSC rate below hardware speed\n");
1401                         return -1;
1402                 }
1403         }
1404
1405         /* TSC scaling required  - calculate ratio */
1406         ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
1407                                 user_tsc_khz, tsc_khz);
1408
1409         if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
1410                 WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
1411                           user_tsc_khz);
1412                 return -1;
1413         }
1414
1415         vcpu->arch.tsc_scaling_ratio = ratio;
1416         return 0;
1417 }
1418
1419 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1420 {
1421         u32 thresh_lo, thresh_hi;
1422         int use_scaling = 0;
1423
1424         /* tsc_khz can be zero if TSC calibration fails */
1425         if (user_tsc_khz == 0) {
1426                 /* set tsc_scaling_ratio to a safe value */
1427                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
1428                 return -1;
1429         }
1430
1431         /* Compute a scale to convert nanoseconds in TSC cycles */
1432         kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
1433                            &vcpu->arch.virtual_tsc_shift,
1434                            &vcpu->arch.virtual_tsc_mult);
1435         vcpu->arch.virtual_tsc_khz = user_tsc_khz;
1436
1437         /*
1438          * Compute the variation in TSC rate which is acceptable
1439          * within the range of tolerance and decide whether the
1440          * requested rate is within those bounds of the hardware
1441          * rate.  If so, no scaling or compensation needs to be done.
1442          */
1443         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1444         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1445         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
1446                 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
1447                 use_scaling = 1;
1448         }
1449         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
1450 }
1451
1452 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1453 {
1454         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1455                                       vcpu->arch.virtual_tsc_mult,
1456                                       vcpu->arch.virtual_tsc_shift);
1457         tsc += vcpu->arch.this_tsc_write;
1458         return tsc;
1459 }
1460
1461 static inline int gtod_is_based_on_tsc(int mode)
1462 {
1463         return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
1464 }
1465
1466 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1467 {
1468 #ifdef CONFIG_X86_64
1469         bool vcpus_matched;
1470         struct kvm_arch *ka = &vcpu->kvm->arch;
1471         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1472
1473         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1474                          atomic_read(&vcpu->kvm->online_vcpus));
1475
1476         /*
1477          * Once the masterclock is enabled, always perform the request in
1478          * order to update it.
1479          *
1480          * In order to enable the masterclock, the host clocksource must be
1481          * TSC and the vcpus need to have matched TSCs.  When that happens,
1482          * perform the request to enable the masterclock.
1483          */
1484         if (ka->use_master_clock ||
1485             (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
1486                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1487
1488         trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1489                             atomic_read(&vcpu->kvm->online_vcpus),
1490                             ka->use_master_clock, gtod->clock.vclock_mode);
1491 #endif
1492 }
1493
1494 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1495 {
1496         u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1497         vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1498 }
1499
1500 /*
1501  * Multiply tsc by a fixed point number represented by ratio.
1502  *
1503  * The most significant 64-N bits (mult) of ratio represent the
1504  * integral part of the fixed point number; the remaining N bits
1505  * (frac) represent the fractional part, i.e. ratio represents a fixed
1506  * point number (mult + frac * 2^(-N)).
1507  *
1508  * N equals kvm_tsc_scaling_ratio_frac_bits.
1509  */
1510 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
1511 {
1512         return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
1513 }
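
/*
 * Worked example (editor's illustrative note, not part of the original
 * source), assuming 48 fractional bits as on Intel hardware: to run a
 * guest at half the host TSC frequency, the ratio would be 1ULL << 47
 * (0.5 in fixed point), and __scale_tsc() returns
 * (tsc * (1ULL << 47)) >> 48 = tsc / 2.
 */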
1514
1515 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1516 {
1517         u64 _tsc = tsc;
1518         u64 ratio = vcpu->arch.tsc_scaling_ratio;
1519
1520         if (ratio != kvm_default_tsc_scaling_ratio)
1521                 _tsc = __scale_tsc(ratio, tsc);
1522
1523         return _tsc;
1524 }
1525 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
1526
1527 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1528 {
1529         u64 tsc;
1530
1531         tsc = kvm_scale_tsc(vcpu, rdtsc());
1532
1533         return target_tsc - tsc;
1534 }
1535
1536 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1537 {
1538         u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1539
1540         return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
1541 }
1542 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
1543
1544 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1545 {
1546         kvm_x86_ops->write_tsc_offset(vcpu, offset);
1547         vcpu->arch.tsc_offset = offset;
1548 }
1549
1550 static inline bool kvm_check_tsc_unstable(void)
1551 {
1552 #ifdef CONFIG_X86_64
1553         /*
1554          * TSC is marked unstable when we're running on Hyper-V, but the
1555          * 'TSC page' clocksource is still good.
1556          */
1557         if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
1558                 return false;
1559 #endif
1560         return check_tsc_unstable();
1561 }
1562
1563 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1564 {
1565         struct kvm *kvm = vcpu->kvm;
1566         u64 offset, ns, elapsed;
1567         unsigned long flags;
1568         bool matched;
1569         bool already_matched;
1570         u64 data = msr->data;
1571         bool synchronizing = false;
1572
1573         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1574         offset = kvm_compute_tsc_offset(vcpu, data);
1575         ns = ktime_get_boot_ns();
1576         elapsed = ns - kvm->arch.last_tsc_nsec;
1577
1578         if (vcpu->arch.virtual_tsc_khz) {
1579                 if (data == 0 && msr->host_initiated) {
1580                         /*
1581                          * detection of vcpu initialization -- need to sync
1582                          * with other vCPUs. This particularly helps to keep
1583                          * kvm_clock stable after CPU hotplug
1584                          */
1585                         synchronizing = true;
1586                 } else {
1587                         u64 tsc_exp = kvm->arch.last_tsc_write +
1588                                                 nsec_to_cycles(vcpu, elapsed);
1589                         u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
1590                         /*
1591                          * Special case: a TSC write that is within one second of
1592                          * virtual cycle time of the expected value is interpreted
1593                          * as an attempt to synchronize the CPU.
1594                          */
1595                         synchronizing = data < tsc_exp + tsc_hz &&
1596                                         data + tsc_hz > tsc_exp;
1597                 }
1598         }
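        /*
         * A sketch of the window check above with illustrative numbers:
         * for virtual_tsc_khz == 2000000 (a 2 GHz guest TSC), tsc_hz is
         * 2,000,000,000 cycles, i.e. one second of guest time.  If
         * tsc_exp == 10e9, then any write with 8e9 < data < 12e9
         * satisfies (data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp)
         * and is treated as a synchronization attempt rather than as a
         * deliberate jump of the guest TSC.
         */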
1599
1600         /*
1601          * For a reliable TSC, we can match TSC offsets, and for an unstable
1602          * TSC, we add elapsed time in this computation.  We could let the
1603          * compensation code attempt to catch up if we fall behind, but
1604          * it's better to try to match offsets from the beginning.
1605          */
1606         if (synchronizing &&
1607             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1608                 if (!kvm_check_tsc_unstable()) {
1609                         offset = kvm->arch.cur_tsc_offset;
1610                         pr_debug("kvm: matched tsc offset for %llu\n", data);
1611                 } else {
1612                         u64 delta = nsec_to_cycles(vcpu, elapsed);
1613                         data += delta;
1614                         offset = kvm_compute_tsc_offset(vcpu, data);
1615                         pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1616                 }
1617                 matched = true;
1618                 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1619         } else {
1620                 /*
1621                  * We split periods of matched TSC writes into generations.
1622                  * For each generation, we track the original measured
1623                  * nanosecond time, offset, and write, so if TSCs are in
1624                  * nanosecond time, offset, and write, so if TSCs are in
1625                  * sync, we can match the exact offset, and if not, we can
1626                  * match the exact software computation in compute_guest_tsc().
1627                  * These values are tracked in kvm->arch.cur_xxx variables.
1628                  */
1629                 kvm->arch.cur_tsc_generation++;
1630                 kvm->arch.cur_tsc_nsec = ns;
1631                 kvm->arch.cur_tsc_write = data;
1632                 kvm->arch.cur_tsc_offset = offset;
1633                 matched = false;
1634                 pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1635                          kvm->arch.cur_tsc_generation, data);
1636         }
1637
1638         /*
1639          * We also track the most recent recorded KHz, write and time to
1640          * allow the matching interval to be extended at each write.
1641          */
1642         kvm->arch.last_tsc_nsec = ns;
1643         kvm->arch.last_tsc_write = data;
1644         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1645
1646         vcpu->arch.last_guest_tsc = data;
1647
1648         /* Keep track of which generation this VCPU has synchronized to */
1649         vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1650         vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1651         vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1652
1653         if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1654                 update_ia32_tsc_adjust_msr(vcpu, offset);
1655
1656         kvm_vcpu_write_tsc_offset(vcpu, offset);
1657         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1658
1659         spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1660         if (!matched) {
1661                 kvm->arch.nr_vcpus_matched_tsc = 0;
1662         } else if (!already_matched) {
1663                 kvm->arch.nr_vcpus_matched_tsc++;
1664         }
1665
1666         kvm_track_tsc_matching(vcpu);
1667         spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1668 }
1669
1670 EXPORT_SYMBOL_GPL(kvm_write_tsc);
1671
1672 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
1673                                            s64 adjustment)
1674 {
1675         kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
1676 }
1677
1678 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
1679 {
1680         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
1681                 WARN_ON(adjustment < 0);
1682         adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
1683         adjust_tsc_offset_guest(vcpu, adjustment);
1684 }
1685
1686 #ifdef CONFIG_X86_64
1687
1688 static u64 read_tsc(void)
1689 {
1690         u64 ret = (u64)rdtsc_ordered();
1691         u64 last = pvclock_gtod_data.clock.cycle_last;
1692
1693         if (likely(ret >= last))
1694                 return ret;
1695
1696         /*
1697          * GCC likes to generate cmov here, but this branch is extremely
1698          * predictable (it's just a function of time and the likely is
1699          * very likely) and there's a data dependence, so force GCC
1700          * to generate a branch instead.  I don't barrier() because
1701          * we don't actually need a barrier, and if this function
1702          * ever gets inlined it will generate worse code.
1703          */
1704         asm volatile ("");
1705         return last;
1706 }
1707
1708 static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
1709 {
1710         long v;
1711         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1712         u64 tsc_pg_val;
1713
1714         switch (gtod->clock.vclock_mode) {
1715         case VCLOCK_HVCLOCK:
1716                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
1717                                                   tsc_timestamp);
1718                 if (tsc_pg_val != U64_MAX) {
1719                         /* TSC page valid */
1720                         *mode = VCLOCK_HVCLOCK;
1721                         v = (tsc_pg_val - gtod->clock.cycle_last) &
1722                                 gtod->clock.mask;
1723                 } else {
1724                         /* TSC page invalid */
1725                         *mode = VCLOCK_NONE;
1726                 }
1727                 break;
1728         case VCLOCK_TSC:
1729                 *mode = VCLOCK_TSC;
1730                 *tsc_timestamp = read_tsc();
1731                 v = (*tsc_timestamp - gtod->clock.cycle_last) &
1732                         gtod->clock.mask;
1733                 break;
1734         default:
1735                 *mode = VCLOCK_NONE;
1736         }
1737
1738         if (*mode == VCLOCK_NONE)
1739                 *tsc_timestamp = v = 0;
1740
1741         return v * gtod->clock.mult;
1742 }
1743
1744 static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
1745 {
1746         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1747         unsigned long seq;
1748         int mode;
1749         u64 ns;
1750
1751         do {
1752                 seq = read_seqcount_begin(&gtod->seq);
1753                 ns = gtod->nsec_base;
1754                 ns += vgettsc(tsc_timestamp, &mode);
1755                 ns >>= gtod->clock.shift;
1756                 ns += gtod->boot_ns;
1757         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1758         *t = ns;
1759
1760         return mode;
1761 }
1762
1763 static int do_realtime(struct timespec *ts, u64 *tsc_timestamp)
1764 {
1765         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1766         unsigned long seq;
1767         int mode;
1768         u64 ns;
1769
1770         do {
1771                 seq = read_seqcount_begin(&gtod->seq);
1772                 ts->tv_sec = gtod->wall_time_sec;
1773                 ns = gtod->nsec_base;
1774                 ns += vgettsc(tsc_timestamp, &mode);
1775                 ns >>= gtod->clock.shift;
1776         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1777
1778         ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
1779         ts->tv_nsec = ns;
1780
1781         return mode;
1782 }
1783
1784 /* returns true if host is using TSC based clocksource */
1785 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
1786 {
1787         /* checked again under seqlock below */
1788         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1789                 return false;
1790
1791         return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
1792                                                       tsc_timestamp));
1793 }
1794
1795 /* returns true if host is using TSC based clocksource */
1796 static bool kvm_get_walltime_and_clockread(struct timespec *ts,
1797                                            u64 *tsc_timestamp)
1798 {
1799         /* checked again under seqlock below */
1800         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
1801                 return false;
1802
1803         return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
1804 }
1805 #endif
1806
1807 /*
1808  *
1809  * Assuming a stable TSC across physical CPUs, and a stable TSC
1810  * across virtual CPUs, the following condition is possible.
1811  * Each numbered line represents an event visible to both
1812  * CPUs at the next numbered event.
1813  *
1814  * "timespecX" represents host monotonic time. "tscX" represents
1815  * RDTSC value.
1816  *
1817  *              VCPU0 on CPU0           |       VCPU1 on CPU1
1818  *
1819  * 1.  read timespec0,tsc0
1820  * 2.                                   | timespec1 = timespec0 + N
1821  *                                      | tsc1 = tsc0 + M
1822  * 3. transition to guest               | transition to guest
1823  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1824  * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
1825  *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1826  *
1827  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1828  *
1829  *      - ret0 < ret1
1830  *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1831  *              ...
1832  *      - 0 < N - M => M < N
1833  *
1834  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1835  * always the case (the difference between two distinct xtime instances
1836  * might be smaller than the difference between corresponding TSC reads,
1837  * when updating guest vcpus pvclock areas).
1838  *
1839  * To avoid that problem, do not allow visibility of distinct
1840  * system_timestamp/tsc_timestamp values simultaneously: use a master
1841  * copy of host monotonic time values. Update that master copy
1842  * in lockstep.
1843  *
1844  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1845  *
1846  */
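/*
 * Numeric illustration of the problem above (made-up values): suppose
 * timespec0 == 100 and tsc0 == 1000 on VCPU0, and VCPU1 samples
 * timespec1 = timespec0 + N, tsc1 = tsc0 + M with N == 10 but M == 30,
 * i.e. the wall-clock delta is smaller than the TSC delta.  For a guest
 * rdtsc value of 1040:
 *
 *      ret0 = 100 + (1040 - 1000) = 140
 *      ret1 = 110 + (1040 - 1030) = 120
 *
 * so a reader that sees ret0 and then ret1 observes time going
 * backwards.  With a master copy, all vcpus derive their pvclock from
 * the same (system_timestamp, tsc_timestamp) pair, so this cannot
 * happen as long as host and guest TSCs are synchronized.
 */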
1847
1848 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1849 {
1850 #ifdef CONFIG_X86_64
1851         struct kvm_arch *ka = &kvm->arch;
1852         int vclock_mode;
1853         bool host_tsc_clocksource, vcpus_matched;
1854
1855         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1856                         atomic_read(&kvm->online_vcpus));
1857
1858         /*
1859          * If the host uses TSC clock, then passthrough TSC as stable
1860          * to the guest.
1861          */
1862         host_tsc_clocksource = kvm_get_time_and_clockread(
1863                                         &ka->master_kernel_ns,
1864                                         &ka->master_cycle_now);
1865
1866         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1867                                 && !ka->backwards_tsc_observed
1868                                 && !ka->boot_vcpu_runs_old_kvmclock;
1869
1870         if (ka->use_master_clock)
1871                 atomic_set(&kvm_guest_has_master_clock, 1);
1872
1873         vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1874         trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1875                                         vcpus_matched);
1876 #endif
1877 }
1878
1879 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
1880 {
1881         kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
1882 }
1883
1884 static void kvm_gen_update_masterclock(struct kvm *kvm)
1885 {
1886 #ifdef CONFIG_X86_64
1887         int i;
1888         struct kvm_vcpu *vcpu;
1889         struct kvm_arch *ka = &kvm->arch;
1890
1891         spin_lock(&ka->pvclock_gtod_sync_lock);
1892         kvm_make_mclock_inprogress_request(kvm);
1893         /* no guest entries from this point */
1894         pvclock_update_vm_gtod_copy(kvm);
1895
1896         kvm_for_each_vcpu(i, vcpu, kvm)
1897                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1898
1899         /* guest entries allowed */
1900         kvm_for_each_vcpu(i, vcpu, kvm)
1901                 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
1902
1903         spin_unlock(&ka->pvclock_gtod_sync_lock);
1904 #endif
1905 }
1906
1907 u64 get_kvmclock_ns(struct kvm *kvm)
1908 {
1909         struct kvm_arch *ka = &kvm->arch;
1910         struct pvclock_vcpu_time_info hv_clock;
1911         u64 ret;
1912
1913         spin_lock(&ka->pvclock_gtod_sync_lock);
1914         if (!ka->use_master_clock) {
1915                 spin_unlock(&ka->pvclock_gtod_sync_lock);
1916                 return ktime_get_boot_ns() + ka->kvmclock_offset;
1917         }
1918
1919         hv_clock.tsc_timestamp = ka->master_cycle_now;
1920         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
1921         spin_unlock(&ka->pvclock_gtod_sync_lock);
1922
1923         /* both __this_cpu_read() and rdtsc() should be on the same cpu */
1924         get_cpu();
1925
1926         if (__this_cpu_read(cpu_tsc_khz)) {
1927                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
1928                                    &hv_clock.tsc_shift,
1929                                    &hv_clock.tsc_to_system_mul);
1930                 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
1931         } else
1932                 ret = ktime_get_boot_ns() + ka->kvmclock_offset;
1933
1934         put_cpu();
1935
1936         return ret;
1937 }
1938
1939 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1940 {
1941         struct kvm_vcpu_arch *vcpu = &v->arch;
1942         struct pvclock_vcpu_time_info guest_hv_clock;
1943
1944         if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1945                 &guest_hv_clock, sizeof(guest_hv_clock))))
1946                 return;
1947
1948         /* This VCPU is paused, but it's legal for a guest to read another
1949          * VCPU's kvmclock, so we really have to follow the specification where
1950          * it says that version is odd if data is being modified, and even after
1951          * it is consistent.
1952          *
1953          * Version field updates must be kept separate.  This is because
1954          * kvm_write_guest_cached might use a "rep movs" instruction, and
1955          * writes within a string instruction are weakly ordered.  So there
1956          * are three writes overall.
1957          *
1958          * As a small optimization, only write the version field in the first
1959          * and third write.  The vcpu->pv_time cache is still valid, because the
1960          * version field is the first in the struct.
1961          */
1962         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
1963
1964         if (guest_hv_clock.version & 1)
1965                 ++guest_hv_clock.version;  /* first time write, random junk */
1966
1967         vcpu->hv_clock.version = guest_hv_clock.version + 1;
1968         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1969                                 &vcpu->hv_clock,
1970                                 sizeof(vcpu->hv_clock.version));
1971
1972         smp_wmb();
1973
1974         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1975         vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
1976
1977         if (vcpu->pvclock_set_guest_stopped_request) {
1978                 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
1979                 vcpu->pvclock_set_guest_stopped_request = false;
1980         }
1981
1982         trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
1983
1984         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1985                                 &vcpu->hv_clock,
1986                                 sizeof(vcpu->hv_clock));
1987
1988         smp_wmb();
1989
1990         vcpu->hv_clock.version++;
1991         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1992                                 &vcpu->hv_clock,
1993                                 sizeof(vcpu->hv_clock.version));
1994 }
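/*
 * For reference, the guest side of the version protocol above is a
 * seqcount-style retry loop; a minimal sketch (not tied to any specific
 * guest implementation) looks like:
 *
 *      do {
 *              version = pvti->version;
 *              smp_rmb();
 *              tsc = rdtsc();
 *              // compute time from pvti->tsc_timestamp,
 *              // pvti->system_time, tsc_to_system_mul, tsc_shift ...
 *              smp_rmb();
 *      } while ((version & 1) || version != pvti->version);
 *
 * which is why the host bumps the version to an odd value before
 * touching the payload and to an even value afterwards, with write
 * barriers in between.
 */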
1995
1996 static int kvm_guest_time_update(struct kvm_vcpu *v)
1997 {
1998         unsigned long flags, tgt_tsc_khz;
1999         struct kvm_vcpu_arch *vcpu = &v->arch;
2000         struct kvm_arch *ka = &v->kvm->arch;
2001         s64 kernel_ns;
2002         u64 tsc_timestamp, host_tsc;
2003         u8 pvclock_flags;
2004         bool use_master_clock;
2005
2006         kernel_ns = 0;
2007         host_tsc = 0;
2008
2009         /*
2010          * If the host uses TSC clock, then passthrough TSC as stable
2011          * to the guest.
2012          */
2013         spin_lock(&ka->pvclock_gtod_sync_lock);
2014         use_master_clock = ka->use_master_clock;
2015         if (use_master_clock) {
2016                 host_tsc = ka->master_cycle_now;
2017                 kernel_ns = ka->master_kernel_ns;
2018         }
2019         spin_unlock(&ka->pvclock_gtod_sync_lock);
2020
2021         /* Keep irq disabled to prevent changes to the clock */
2022         local_irq_save(flags);
2023         tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2024         if (unlikely(tgt_tsc_khz == 0)) {
2025                 local_irq_restore(flags);
2026                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2027                 return 1;
2028         }
2029         if (!use_master_clock) {
2030                 host_tsc = rdtsc();
2031                 kernel_ns = ktime_get_boot_ns();
2032         }
2033
2034         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2035
2036         /*
2037          * We may have to catch up the TSC to match elapsed wall clock
2038          * time for two reasons, even if kvmclock is used.
2039          *   1) CPU could have been running below the maximum TSC rate
2040          *   2) Broken TSC compensation resets the base at each VCPU
2041          *      entry to avoid unknown leaps of TSC even when running
2042          *      again on the same CPU.  This may cause apparent elapsed
2043          *      time to disappear, and the guest to stand still or run
2044          *      very slowly.
2045          */
2046         if (vcpu->tsc_catchup) {
2047                 u64 tsc = compute_guest_tsc(v, kernel_ns);
2048                 if (tsc > tsc_timestamp) {
2049                         adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2050                         tsc_timestamp = tsc;
2051                 }
2052         }
2053
2054         local_irq_restore(flags);
2055
2056         /* With all the info we got, fill in the values */
2057
2058         if (kvm_has_tsc_control)
2059                 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2060
2061         if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2062                 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2063                                    &vcpu->hv_clock.tsc_shift,
2064                                    &vcpu->hv_clock.tsc_to_system_mul);
2065                 vcpu->hw_tsc_khz = tgt_tsc_khz;
2066         }
2067
2068         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2069         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2070         vcpu->last_guest_tsc = tsc_timestamp;
2071
2072         /* If the host uses TSC clocksource, then it is stable */
2073         pvclock_flags = 0;
2074         if (use_master_clock)
2075                 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2076
2077         vcpu->hv_clock.flags = pvclock_flags;
2078
2079         if (vcpu->pv_time_enabled)
2080                 kvm_setup_pvclock_page(v);
2081         if (v == kvm_get_vcpu(v->kvm, 0))
2082                 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2083         return 0;
2084 }
2085
2086 /*
2087  * kvmclock updates which are isolated to a given vcpu, such as
2088  * vcpu->cpu migration, should not allow system_timestamp from
2089  * the rest of the vcpus to remain static. Otherwise ntp frequency
2090  * correction applies to one vcpu's system_timestamp but not
2091  * the others.
2092  *
2093  * So in those cases, request a kvmclock update for all vcpus.
2094  * We need to rate-limit these requests though, as they can
2095  * considerably slow guests that have a large number of vcpus.
2096  * The time for a remote vcpu to update its kvmclock is bound
2097  * by the delay we use to rate-limit the updates.
2098  */
2099
2100 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2101
2102 static void kvmclock_update_fn(struct work_struct *work)
2103 {
2104         int i;
2105         struct delayed_work *dwork = to_delayed_work(work);
2106         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2107                                            kvmclock_update_work);
2108         struct kvm *kvm = container_of(ka, struct kvm, arch);
2109         struct kvm_vcpu *vcpu;
2110
2111         kvm_for_each_vcpu(i, vcpu, kvm) {
2112                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2113                 kvm_vcpu_kick(vcpu);
2114         }
2115 }
2116
2117 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2118 {
2119         struct kvm *kvm = v->kvm;
2120
2121         kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2122         schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2123                                         KVMCLOCK_UPDATE_DELAY);
2124 }
2125
2126 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2127
2128 static void kvmclock_sync_fn(struct work_struct *work)
2129 {
2130         struct delayed_work *dwork = to_delayed_work(work);
2131         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2132                                            kvmclock_sync_work);
2133         struct kvm *kvm = container_of(ka, struct kvm, arch);
2134
2135         if (!kvmclock_periodic_sync)
2136                 return;
2137
2138         schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2139         schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2140                                         KVMCLOCK_SYNC_PERIOD);
2141 }
2142
2143 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2144 {
2145         u64 mcg_cap = vcpu->arch.mcg_cap;
2146         unsigned bank_num = mcg_cap & 0xff;
2147         u32 msr = msr_info->index;
2148         u64 data = msr_info->data;
2149
2150         switch (msr) {
2151         case MSR_IA32_MCG_STATUS:
2152                 vcpu->arch.mcg_status = data;
2153                 break;
2154         case MSR_IA32_MCG_CTL:
2155                 if (!(mcg_cap & MCG_CTL_P))
2156                         return 1;
2157                 if (data != 0 && data != ~(u64)0)
2158                         return -1;
2159                 vcpu->arch.mcg_ctl = data;
2160                 break;
2161         default:
2162                 if (msr >= MSR_IA32_MC0_CTL &&
2163                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2164                         u32 offset = msr - MSR_IA32_MC0_CTL;
2165                         /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2166                          * However, some Linux kernels clear bit 10 in bank 4 to
2167                          * work around a BIOS/GART TLB issue on AMD K8s, so
2168                          * ignore that case to avoid an uncaught #GP in the guest.
2169                          */
2170                         if ((offset & 0x3) == 0 &&
2171                             data != 0 && (data | (1 << 10)) != ~(u64)0)
2172                                 return -1;
2173                         if (!msr_info->host_initiated &&
2174                                 (offset & 0x3) == 1 && data != 0)
2175                                 return -1;
2176                         vcpu->arch.mce_banks[offset] = data;
2177                         break;
2178                 }
2179                 return 1;
2180         }
2181         return 0;
2182 }
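/*
 * Concrete examples for the IA32_MCi_CTL check above (illustrative):
 *
 *      data == 0                        accepted, reporting disabled
 *      data == ~0ULL                    accepted, all error types enabled
 *      data == (~0ULL & ~(1ULL << 10))  accepted; the value some Linux
 *                                       guests write to bank 4 for the
 *                                       K8 GART quirk
 *      data == 0x1                      rejected, the guest gets a #GP
 */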
2183
2184 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2185 {
2186         struct kvm *kvm = vcpu->kvm;
2187         int lm = is_long_mode(vcpu);
2188         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2189                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2190         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2191                 : kvm->arch.xen_hvm_config.blob_size_32;
2192         u32 page_num = data & ~PAGE_MASK;
2193         u64 page_addr = data & PAGE_MASK;
2194         u8 *page;
2195         int r;
2196
2197         r = -E2BIG;
2198         if (page_num >= blob_size)
2199                 goto out;
2200         r = -ENOMEM;
2201         page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2202         if (IS_ERR(page)) {
2203                 r = PTR_ERR(page);
2204                 goto out;
2205         }
2206         if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2207                 goto out_free;
2208         r = 0;
2209 out_free:
2210         kfree(page);
2211 out:
2212         return r;
2213 }
2214
2215 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2216 {
2217         gpa_t gpa = data & ~0x3f;
2218
2219         /* Bits 3:5 are reserved, should be zero */
2220         if (data & 0x38)
2221                 return 1;
2222
2223         vcpu->arch.apf.msr_val = data;
2224
2225         if (!(data & KVM_ASYNC_PF_ENABLED)) {
2226                 kvm_clear_async_pf_completion_queue(vcpu);
2227                 kvm_async_pf_hash_reset(vcpu);
2228                 return 0;
2229         }
2230
2231         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2232                                         sizeof(u32)))
2233                 return 1;
2234
2235         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2236         vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2237         kvm_async_pf_wakeup_all(vcpu);
2238         return 0;
2239 }
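/*
 * Layout of the MSR_KVM_ASYNC_PF_EN value handled above, summarized for
 * reference (see Documentation/virtual/kvm/msr.txt for the normative
 * description):
 *
 *      bit  0      KVM_ASYNC_PF_ENABLED
 *      bit  1      KVM_ASYNC_PF_SEND_ALWAYS
 *      bit  2      KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT
 *      bits 3:5    reserved, must be zero (the data & 0x38 check)
 *      bits 6:63   GPA of the 64-byte aligned shared area (data & ~0x3f)
 */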
2240
2241 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2242 {
2243         vcpu->arch.pv_time_enabled = false;
2244 }
2245
2246 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
2247 {
2248         ++vcpu->stat.tlb_flush;
2249         kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
2250 }
2251
2252 static void record_steal_time(struct kvm_vcpu *vcpu)
2253 {
2254         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2255                 return;
2256
2257         if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2258                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2259                 return;
2260
2261         /*
2262          * Doing a TLB flush here, on the guest's behalf, can avoid
2263          * expensive IPIs.
2264          */
2265         if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
2266                 kvm_vcpu_flush_tlb(vcpu, false);
2267
2268         if (vcpu->arch.st.steal.version & 1)
2269                 vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
2270
2271         vcpu->arch.st.steal.version += 1;
2272
2273         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2274                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2275
2276         smp_wmb();
2277
2278         vcpu->arch.st.steal.steal += current->sched_info.run_delay -
2279                 vcpu->arch.st.last_steal;
2280         vcpu->arch.st.last_steal = current->sched_info.run_delay;
2281
2282         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2283                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2284
2285         smp_wmb();
2286
2287         vcpu->arch.st.steal.version += 1;
2288
2289         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2290                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2291 }
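/*
 * The xchg() of st.steal.preempted above is the host half of the
 * paravirtual TLB-flush handshake: rather than sending an IPI to a
 * preempted vCPU, the guest sets KVM_VCPU_FLUSH_TLB in that vCPU's
 * steal-time area and the flush happens here, before the vCPU runs
 * again.  The version field uses the same odd/even protocol as the
 * pvclock page, so a guest reads steal time in a retry loop (sketch):
 *
 *      do {
 *              version = st->version;
 *              smp_rmb();
 *              steal = st->steal;
 *              smp_rmb();
 *      } while ((version & 1) || version != st->version);
 */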
2292
2293 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2294 {
2295         bool pr = false;
2296         u32 msr = msr_info->index;
2297         u64 data = msr_info->data;
2298
2299         switch (msr) {
2300         case MSR_AMD64_NB_CFG:
2301         case MSR_IA32_UCODE_WRITE:
2302         case MSR_VM_HSAVE_PA:
2303         case MSR_AMD64_PATCH_LOADER:
2304         case MSR_AMD64_BU_CFG2:
2305         case MSR_AMD64_DC_CFG:
2306                 break;
2307
2308         case MSR_IA32_UCODE_REV:
2309                 if (msr_info->host_initiated)
2310                         vcpu->arch.microcode_version = data;
2311                 break;
2312         case MSR_EFER:
2313                 return set_efer(vcpu, data);
2314         case MSR_K7_HWCR:
2315                 data &= ~(u64)0x40;     /* ignore flush filter disable */
2316                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
2317                 data &= ~(u64)0x8;      /* ignore TLB cache disable */
2318                 data &= ~(u64)0x40000;  /* ignore MC status write enable */
2319                 if (data != 0) {
2320                         vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2321                                     data);
2322                         return 1;
2323                 }
2324                 break;
2325         case MSR_FAM10H_MMIO_CONF_BASE:
2326                 if (data != 0) {
2327                         vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2328                                     "0x%llx\n", data);
2329                         return 1;
2330                 }
2331                 break;
2332         case MSR_IA32_DEBUGCTLMSR:
2333                 if (!data) {
2334                         /* We support the non-activated case already */
2335                         break;
2336                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2337                         /* Values other than LBR and BTF are vendor-specific,
2338                            thus reserved and should throw a #GP */
2339                         return 1;
2340                 }
2341                 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2342                             __func__, data);
2343                 break;
2344         case 0x200 ... 0x2ff:
2345                 return kvm_mtrr_set_msr(vcpu, msr, data);
2346         case MSR_IA32_APICBASE:
2347                 return kvm_set_apic_base(vcpu, msr_info);
2348         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2349                 return kvm_x2apic_msr_write(vcpu, msr, data);
2350         case MSR_IA32_TSCDEADLINE:
2351                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
2352                 break;
2353         case MSR_IA32_TSC_ADJUST:
2354                 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
2355                         if (!msr_info->host_initiated) {
2356                                 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2357                                 adjust_tsc_offset_guest(vcpu, adj);
2358                         }
2359                         vcpu->arch.ia32_tsc_adjust_msr = data;
2360                 }
2361                 break;
2362         case MSR_IA32_MISC_ENABLE:
2363                 vcpu->arch.ia32_misc_enable_msr = data;
2364                 break;
2365         case MSR_IA32_SMBASE:
2366                 if (!msr_info->host_initiated)
2367                         return 1;
2368                 vcpu->arch.smbase = data;
2369                 break;
2370         case MSR_IA32_TSC:
2371                 kvm_write_tsc(vcpu, msr_info);
2372                 break;
2373         case MSR_SMI_COUNT:
2374                 if (!msr_info->host_initiated)
2375                         return 1;
2376                 vcpu->arch.smi_count = data;
2377                 break;
2378         case MSR_KVM_WALL_CLOCK_NEW:
2379         case MSR_KVM_WALL_CLOCK:
2380                 vcpu->kvm->arch.wall_clock = data;
2381                 kvm_write_wall_clock(vcpu->kvm, data);
2382                 break;
2383         case MSR_KVM_SYSTEM_TIME_NEW:
2384         case MSR_KVM_SYSTEM_TIME: {
2385                 struct kvm_arch *ka = &vcpu->kvm->arch;
2386
2387                 kvmclock_reset(vcpu);
2388
2389                 if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2390                         bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2391
2392                         if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2393                                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2394
2395                         ka->boot_vcpu_runs_old_kvmclock = tmp;
2396                 }
2397
2398                 vcpu->arch.time = data;
2399                 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2400
2401                 /* only proceed if the enable bit (bit 0) is set */
2402                 if (!(data & 1))
2403                         break;
2404
2405                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2406                      &vcpu->arch.pv_time, data & ~1ULL,
2407                      sizeof(struct pvclock_vcpu_time_info)))
2408                         vcpu->arch.pv_time_enabled = false;
2409                 else
2410                         vcpu->arch.pv_time_enabled = true;
2411
2412                 break;
2413         }
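        /*
         * From the guest's point of view the case above is driven by a
         * single wrmsr; a minimal sketch (guest side, illustrative):
         *
         *      u64 gpa = ...;  // guest-physical address of a per-cpu
         *                      // struct pvclock_vcpu_time_info
         *      wrmsrl(MSR_KVM_SYSTEM_TIME_NEW, gpa | 1); // bit 0 = enable
         *      ...
         *      wrmsrl(MSR_KVM_SYSTEM_TIME_NEW, 0);       // disable again
         *
         * which is why only bit 0 is required to be set here and is
         * masked off (data & ~1ULL) before the gfn-to-hva cache is
         * initialized.
         */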
2414         case MSR_KVM_ASYNC_PF_EN:
2415                 if (kvm_pv_enable_async_pf(vcpu, data))
2416                         return 1;
2417                 break;
2418         case MSR_KVM_STEAL_TIME:
2419
2420                 if (unlikely(!sched_info_on()))
2421                         return 1;
2422
2423                 if (data & KVM_STEAL_RESERVED_MASK)
2424                         return 1;
2425
2426                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2427                                                 data & KVM_STEAL_VALID_BITS,
2428                                                 sizeof(struct kvm_steal_time)))
2429                         return 1;
2430
2431                 vcpu->arch.st.msr_val = data;
2432
2433                 if (!(data & KVM_MSR_ENABLED))
2434                         break;
2435
2436                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2437
2438                 break;
2439         case MSR_KVM_PV_EOI_EN:
2440                 if (kvm_lapic_enable_pv_eoi(vcpu, data))
2441                         return 1;
2442                 break;
2443
2444         case MSR_IA32_MCG_CTL:
2445         case MSR_IA32_MCG_STATUS:
2446         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2447                 return set_msr_mce(vcpu, msr_info);
2448
2449         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2450         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2451                 pr = true; /* fall through */
2452         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2453         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2454                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2455                         return kvm_pmu_set_msr(vcpu, msr_info);
2456
2457                 if (pr || data != 0)
2458                         vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2459                                     "0x%x data 0x%llx\n", msr, data);
2460                 break;
2461         case MSR_K7_CLK_CTL:
2462                 /*
2463                  * Ignore all writes to this no longer documented MSR.
2464                  * Writes are only relevant for old K7 processors,
2465                  * all pre-dating SVM, but a recommended workaround from
2466                  * AMD for these chips. It is possible to specify the
2467                  * affected processor models on the command line, hence
2468                  * the need to ignore the workaround.
2469                  */
2470                 break;
2471         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2472         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2473         case HV_X64_MSR_CRASH_CTL:
2474         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2475         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2476         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2477         case HV_X64_MSR_TSC_EMULATION_STATUS:
2478                 return kvm_hv_set_msr_common(vcpu, msr, data,
2479                                              msr_info->host_initiated);
2480         case MSR_IA32_BBL_CR_CTL3:
2481                 /* Drop writes to this legacy MSR -- see rdmsr
2482                  * counterpart for further detail.
2483                  */
2484                 if (report_ignored_msrs)
2485                         vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
2486                                 msr, data);
2487                 break;
2488         case MSR_AMD64_OSVW_ID_LENGTH:
2489                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2490                         return 1;
2491                 vcpu->arch.osvw.length = data;
2492                 break;
2493         case MSR_AMD64_OSVW_STATUS:
2494                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2495                         return 1;
2496                 vcpu->arch.osvw.status = data;
2497                 break;
2498         case MSR_PLATFORM_INFO:
2499                 if (!msr_info->host_initiated ||
2500                     data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
2501                     (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
2502                      cpuid_fault_enabled(vcpu)))
2503                         return 1;
2504                 vcpu->arch.msr_platform_info = data;
2505                 break;
2506         case MSR_MISC_FEATURES_ENABLES:
2507                 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
2508                     (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
2509                      !supports_cpuid_fault(vcpu)))
2510                         return 1;
2511                 vcpu->arch.msr_misc_features_enables = data;
2512                 break;
2513         default:
2514                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2515                         return xen_hvm_config(vcpu, data);
2516                 if (kvm_pmu_is_valid_msr(vcpu, msr))
2517                         return kvm_pmu_set_msr(vcpu, msr_info);
2518                 if (!ignore_msrs) {
2519                         vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2520                                     msr, data);
2521                         return 1;
2522                 } else {
2523                         if (report_ignored_msrs)
2524                                 vcpu_unimpl(vcpu,
2525                                         "ignored wrmsr: 0x%x data 0x%llx\n",
2526                                         msr, data);
2527                         break;
2528                 }
2529         }
2530         return 0;
2531 }
2532 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2533
2534
2535 /*
2536  * Reads the MSR specified by msr->index into msr->data.
2537  * Returns 0 on success, non-0 otherwise.
2538  * Assumes vcpu_load() was already called.
2539  */
2540 int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2541 {
2542         return kvm_x86_ops->get_msr(vcpu, msr);
2543 }
2544 EXPORT_SYMBOL_GPL(kvm_get_msr);
2545
2546 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2547 {
2548         u64 data;
2549         u64 mcg_cap = vcpu->arch.mcg_cap;
2550         unsigned bank_num = mcg_cap & 0xff;
2551
2552         switch (msr) {
2553         case MSR_IA32_P5_MC_ADDR:
2554         case MSR_IA32_P5_MC_TYPE:
2555                 data = 0;
2556                 break;
2557         case MSR_IA32_MCG_CAP:
2558                 data = vcpu->arch.mcg_cap;
2559                 break;
2560         case MSR_IA32_MCG_CTL:
2561                 if (!(mcg_cap & MCG_CTL_P))
2562                         return 1;
2563                 data = vcpu->arch.mcg_ctl;
2564                 break;
2565         case MSR_IA32_MCG_STATUS:
2566                 data = vcpu->arch.mcg_status;
2567                 break;
2568         default:
2569                 if (msr >= MSR_IA32_MC0_CTL &&
2570                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2571                         u32 offset = msr - MSR_IA32_MC0_CTL;
2572                         data = vcpu->arch.mce_banks[offset];
2573                         break;
2574                 }
2575                 return 1;
2576         }
2577         *pdata = data;
2578         return 0;
2579 }
2580
2581 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2582 {
2583         switch (msr_info->index) {
2584         case MSR_IA32_PLATFORM_ID:
2585         case MSR_IA32_EBL_CR_POWERON:
2586         case MSR_IA32_DEBUGCTLMSR:
2587         case MSR_IA32_LASTBRANCHFROMIP:
2588         case MSR_IA32_LASTBRANCHTOIP:
2589         case MSR_IA32_LASTINTFROMIP:
2590         case MSR_IA32_LASTINTTOIP:
2591         case MSR_K8_SYSCFG:
2592         case MSR_K8_TSEG_ADDR:
2593         case MSR_K8_TSEG_MASK:
2594         case MSR_K7_HWCR:
2595         case MSR_VM_HSAVE_PA:
2596         case MSR_K8_INT_PENDING_MSG:
2597         case MSR_AMD64_NB_CFG:
2598         case MSR_FAM10H_MMIO_CONF_BASE:
2599         case MSR_AMD64_BU_CFG2:
2600         case MSR_IA32_PERF_CTL:
2601         case MSR_AMD64_DC_CFG:
2602                 msr_info->data = 0;
2603                 break;
2604         case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
2605         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2606         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2607         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2608         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
2609                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2610                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2611                 msr_info->data = 0;
2612                 break;
2613         case MSR_IA32_UCODE_REV:
2614                 msr_info->data = vcpu->arch.microcode_version;
2615                 break;
2616         case MSR_IA32_TSC:
2617                 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
2618                 break;
2619         case MSR_MTRRcap:
2620         case 0x200 ... 0x2ff:
2621                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
2622         case 0xcd: /* fsb frequency */
2623                 msr_info->data = 3;
2624                 break;
2625                 /*
2626                  * MSR_EBC_FREQUENCY_ID
2627                  * Conservative value valid even for the basic CPU models.
2628                  * Models 0 and 1: 000 in bits 23:21 indicates a bus speed
2629                  * of 100MHz; model 2: 000 in bits 18:16 indicates 100MHz;
2630                  * models 3 and 4: 266MHz.  Set the Core Clock Frequency
2631                  * to System Bus Frequency Ratio to 1 (bits 31:24) even
2632                  * though this ratio is only valid for CPU models > 2;
2633                  * otherwise guests may end up dividing or multiplying
2634                  * by zero.
2635                  */
2636         case MSR_EBC_FREQUENCY_ID:
2637                 msr_info->data = 1 << 24;
2638                 break;
2639         case MSR_IA32_APICBASE:
2640                 msr_info->data = kvm_get_apic_base(vcpu);
2641                 break;
2642         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2643                 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2644                 break;
2645         case MSR_IA32_TSCDEADLINE:
2646                 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
2647                 break;
2648         case MSR_IA32_TSC_ADJUST:
2649                 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2650                 break;
2651         case MSR_IA32_MISC_ENABLE:
2652                 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
2653                 break;
2654         case MSR_IA32_SMBASE:
2655                 if (!msr_info->host_initiated)
2656                         return 1;
2657                 msr_info->data = vcpu->arch.smbase;
2658                 break;
2659         case MSR_SMI_COUNT:
2660                 msr_info->data = vcpu->arch.smi_count;
2661                 break;
2662         case MSR_IA32_PERF_STATUS:
2663                 /* TSC increment by tick */
2664                 msr_info->data = 1000ULL;
2665                 /* CPU multiplier */
2666                 msr_info->data |= (((uint64_t)4ULL) << 40);
2667                 break;
2668         case MSR_EFER:
2669                 msr_info->data = vcpu->arch.efer;
2670                 break;
2671         case MSR_KVM_WALL_CLOCK:
2672         case MSR_KVM_WALL_CLOCK_NEW:
2673                 msr_info->data = vcpu->kvm->arch.wall_clock;
2674                 break;
2675         case MSR_KVM_SYSTEM_TIME:
2676         case MSR_KVM_SYSTEM_TIME_NEW:
2677                 msr_info->data = vcpu->arch.time;
2678                 break;
2679         case MSR_KVM_ASYNC_PF_EN:
2680                 msr_info->data = vcpu->arch.apf.msr_val;
2681                 break;
2682         case MSR_KVM_STEAL_TIME:
2683                 msr_info->data = vcpu->arch.st.msr_val;
2684                 break;
2685         case MSR_KVM_PV_EOI_EN:
2686                 msr_info->data = vcpu->arch.pv_eoi.msr_val;
2687                 break;
2688         case MSR_IA32_P5_MC_ADDR:
2689         case MSR_IA32_P5_MC_TYPE:
2690         case MSR_IA32_MCG_CAP:
2691         case MSR_IA32_MCG_CTL:
2692         case MSR_IA32_MCG_STATUS:
2693         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2694                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
2695         case MSR_K7_CLK_CTL:
2696                 /*
2697                  * Provide expected ramp-up count for K7. All other
2698                  * are set to zero, indicating minimum divisors for
2699                  * every field.
2700                  *
2701                  * This prevents guest kernels on AMD host with CPU
2702                  * type 6, model 8 and higher from exploding due to
2703                  * the rdmsr failing.
2704                  */
2705                 msr_info->data = 0x20000000;
2706                 break;
2707         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2708         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2709         case HV_X64_MSR_CRASH_CTL:
2710         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2711         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2712         case HV_X64_MSR_TSC_EMULATION_CONTROL:
2713         case HV_X64_MSR_TSC_EMULATION_STATUS:
2714                 return kvm_hv_get_msr_common(vcpu,
2715                                              msr_info->index, &msr_info->data);
2716                 break;
2717         case MSR_IA32_BBL_CR_CTL3:
2718                 /* This legacy MSR exists but isn't fully documented in current
2719                  * silicon.  It is however accessed by winxp in very narrow
2720                  * scenarios where it sets bit #19, itself documented as
2721                  * a "reserved" bit.  Best effort attempt to source coherent
2722                  * read data here should the balance of the register be
2723                  * interpreted by the guest:
2724                  *
2725                  * L2 cache control register 3: 64GB range, 256KB size,
2726                  * enabled, latency 0x1, configured
2727                  */
2728                 msr_info->data = 0xbe702111;
2729                 break;
2730         case MSR_AMD64_OSVW_ID_LENGTH:
2731                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2732                         return 1;
2733                 msr_info->data = vcpu->arch.osvw.length;
2734                 break;
2735         case MSR_AMD64_OSVW_STATUS:
2736                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
2737                         return 1;
2738                 msr_info->data = vcpu->arch.osvw.status;
2739                 break;
2740         case MSR_PLATFORM_INFO:
2741                 msr_info->data = vcpu->arch.msr_platform_info;
2742                 break;
2743         case MSR_MISC_FEATURES_ENABLES:
2744                 msr_info->data = vcpu->arch.msr_misc_features_enables;
2745                 break;
2746         default:
2747                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2748                         return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2749                 if (!ignore_msrs) {
2750                         vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2751                                                msr_info->index);
2752                         return 1;
2753                 } else {
2754                         if (report_ignored_msrs)
2755                                 vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2756                                         msr_info->index);
2757                         msr_info->data = 0;
2758                 }
2759                 break;
2760         }
2761         return 0;
2762 }
2763 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2764
2765 /*
2766  * Read or write a bunch of msrs. All parameters are kernel addresses.
2767  *
2768  * @return number of msrs set successfully.
2769  */
2770 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2771                     struct kvm_msr_entry *entries,
2772                     int (*do_msr)(struct kvm_vcpu *vcpu,
2773                                   unsigned index, u64 *data))
2774 {
2775         int i;
2776
2777         for (i = 0; i < msrs->nmsrs; ++i)
2778                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2779                         break;
2780
2781         return i;
2782 }
2783
2784 /*
2785  * Read or write a bunch of msrs. Parameters are user addresses.
2786  *
2787  * @return number of msrs set successfully.
2788  */
2789 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2790                   int (*do_msr)(struct kvm_vcpu *vcpu,
2791                                 unsigned index, u64 *data),
2792                   int writeback)
2793 {
2794         struct kvm_msrs msrs;
2795         struct kvm_msr_entry *entries;
2796         int r, n;
2797         unsigned size;
2798
2799         r = -EFAULT;
2800         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2801                 goto out;
2802
2803         r = -E2BIG;
2804         if (msrs.nmsrs >= MAX_IO_MSRS)
2805                 goto out;
2806
2807         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2808         entries = memdup_user(user_msrs->entries, size);
2809         if (IS_ERR(entries)) {
2810                 r = PTR_ERR(entries);
2811                 goto out;
2812         }
2813
2814         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2815         if (r < 0)
2816                 goto out_free;
2817
2818         r = -EFAULT;
2819         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2820                 goto out_free;
2821
2822         r = n;
2823
2824 out_free:
2825         kfree(entries);
2826 out:
2827         return r;
2828 }
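/*
 * Userspace reaches this path via the KVM_GET_MSRS/KVM_SET_MSRS vcpu
 * ioctls.  A minimal caller reading one MSR might look like this
 * (illustrative sketch):
 *
 *      struct {
 *              struct kvm_msrs hdr;
 *              struct kvm_msr_entry entries[1];
 *      } req = {
 *              .hdr.nmsrs = 1,
 *              .entries[0].index = MSR_IA32_TSC,
 *      };
 *      int n = ioctl(vcpu_fd, KVM_GET_MSRS, &req);
 *
 * where n is the number of MSRs processed (0 or 1 here) and
 * req.entries[0].data holds the value on success.
 */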
2829
2830 static inline bool kvm_can_mwait_in_guest(void)
2831 {
2832         return boot_cpu_has(X86_FEATURE_MWAIT) &&
2833                 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
2834                 boot_cpu_has(X86_FEATURE_ARAT);
2835 }
2836
2837 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2838 {
2839         int r = 0;
2840
2841         switch (ext) {
2842         case KVM_CAP_IRQCHIP:
2843         case KVM_CAP_HLT:
2844         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2845         case KVM_CAP_SET_TSS_ADDR:
2846         case KVM_CAP_EXT_CPUID:
2847         case KVM_CAP_EXT_EMUL_CPUID:
2848         case KVM_CAP_CLOCKSOURCE:
2849         case KVM_CAP_PIT:
2850         case KVM_CAP_NOP_IO_DELAY:
2851         case KVM_CAP_MP_STATE:
2852         case KVM_CAP_SYNC_MMU:
2853         case KVM_CAP_USER_NMI:
2854         case KVM_CAP_REINJECT_CONTROL:
2855         case KVM_CAP_IRQ_INJECT_STATUS:
2856         case KVM_CAP_IOEVENTFD:
2857         case KVM_CAP_IOEVENTFD_NO_LENGTH:
2858         case KVM_CAP_PIT2:
2859         case KVM_CAP_PIT_STATE2:
2860         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2861         case KVM_CAP_XEN_HVM:
2862         case KVM_CAP_VCPU_EVENTS:
2863         case KVM_CAP_HYPERV:
2864         case KVM_CAP_HYPERV_VAPIC:
2865         case KVM_CAP_HYPERV_SPIN:
2866         case KVM_CAP_HYPERV_SYNIC:
2867         case KVM_CAP_HYPERV_SYNIC2:
2868         case KVM_CAP_HYPERV_VP_INDEX:
2869         case KVM_CAP_HYPERV_EVENTFD:
2870         case KVM_CAP_PCI_SEGMENT:
2871         case KVM_CAP_DEBUGREGS:
2872         case KVM_CAP_X86_ROBUST_SINGLESTEP:
2873         case KVM_CAP_XSAVE:
2874         case KVM_CAP_ASYNC_PF:
2875         case KVM_CAP_GET_TSC_KHZ:
2876         case KVM_CAP_KVMCLOCK_CTRL:
2877         case KVM_CAP_READONLY_MEM:
2878         case KVM_CAP_HYPERV_TIME:
2879         case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2880         case KVM_CAP_TSC_DEADLINE_TIMER:
2881         case KVM_CAP_ENABLE_CAP_VM:
2882         case KVM_CAP_DISABLE_QUIRKS:
2883         case KVM_CAP_SET_BOOT_CPU_ID:
2884         case KVM_CAP_SPLIT_IRQCHIP:
2885         case KVM_CAP_IMMEDIATE_EXIT:
2886         case KVM_CAP_GET_MSR_FEATURES:
2887                 r = 1;
2888                 break;
2889         case KVM_CAP_SYNC_REGS:
2890                 r = KVM_SYNC_X86_VALID_FIELDS;
2891                 break;
2892         case KVM_CAP_ADJUST_CLOCK:
2893                 r = KVM_CLOCK_TSC_STABLE;
2894                 break;
2895         case KVM_CAP_X86_DISABLE_EXITS:
2896                 r |= KVM_X86_DISABLE_EXITS_HTL | KVM_X86_DISABLE_EXITS_PAUSE;
2897                 if (kvm_can_mwait_in_guest())
2898                         r |= KVM_X86_DISABLE_EXITS_MWAIT;
2899                 break;
2900         case KVM_CAP_X86_SMM:
2901                 /* SMBASE is usually relocated above 1M on modern chipsets,
2902                  * and SMM handlers might indeed rely on 4G segment limits,
2903                  * so do not report SMM to be available if real mode is
2904                  * emulated via vm86 mode.  Still, do not go to great lengths
2905                  * to avoid userspace's usage of the feature, because it is a
2906                  * fringe case that is not enabled except via specific settings
2907                  * of the module parameters.
2908                  */
2909                 r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
2910                 break;
2911         case KVM_CAP_VAPIC:
2912                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2913                 break;
2914         case KVM_CAP_NR_VCPUS:
2915                 r = KVM_SOFT_MAX_VCPUS;
2916                 break;
2917         case KVM_CAP_MAX_VCPUS:
2918                 r = KVM_MAX_VCPUS;
2919                 break;
2920         case KVM_CAP_NR_MEMSLOTS:
2921                 r = KVM_USER_MEM_SLOTS;
2922                 break;
2923         case KVM_CAP_PV_MMU:    /* obsolete */
2924                 r = 0;
2925                 break;
2926         case KVM_CAP_MCE:
2927                 r = KVM_MAX_MCE_BANKS;
2928                 break;
2929         case KVM_CAP_XCRS:
2930                 r = boot_cpu_has(X86_FEATURE_XSAVE);
2931                 break;
2932         case KVM_CAP_TSC_CONTROL:
2933                 r = kvm_has_tsc_control;
2934                 break;
2935         case KVM_CAP_X2APIC_API:
2936                 r = KVM_X2APIC_API_VALID_FLAGS;
2937                 break;
2938         default:
2939                 break;
2940         }
2941         return r;
2943 }
2944
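     /*
      * Ioctls issued on /dev/kvm itself (not on a VM or vCPU fd): report the
      * MSRs that KVM saves/emulates, the supported and emulated CPUID leaves,
      * the supported MCE capabilities, and the values of MSR-based features.
      */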
2945 long kvm_arch_dev_ioctl(struct file *filp,
2946                         unsigned int ioctl, unsigned long arg)
2947 {
2948         void __user *argp = (void __user *)arg;
2949         long r;
2950
2951         switch (ioctl) {
2952         case KVM_GET_MSR_INDEX_LIST: {
2953                 struct kvm_msr_list __user *user_msr_list = argp;
2954                 struct kvm_msr_list msr_list;
2955                 unsigned n;
2956
2957                 r = -EFAULT;
2958                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
2959                         goto out;
2960                 n = msr_list.nmsrs;
2961                 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
2962                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
2963                         goto out;
2964                 r = -E2BIG;
2965                 if (n < msr_list.nmsrs)
2966                         goto out;
2967                 r = -EFAULT;
2968                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2969                                  num_msrs_to_save * sizeof(u32)))
2970                         goto out;
2971                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2972                                  &emulated_msrs,
2973                                  num_emulated_msrs * sizeof(u32)))
2974                         goto out;
2975                 r = 0;
2976                 break;
2977         }
2978         case KVM_GET_SUPPORTED_CPUID:
2979         case KVM_GET_EMULATED_CPUID: {
2980                 struct kvm_cpuid2 __user *cpuid_arg = argp;
2981                 struct kvm_cpuid2 cpuid;
2982
2983                 r = -EFAULT;
2984                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
2985                         goto out;
2986
2987                 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
2988                                             ioctl);
2989                 if (r)
2990                         goto out;
2991
2992                 r = -EFAULT;
2993                 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
2994                         goto out;
2995                 r = 0;
2996                 break;
2997         }
2998         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2999                 r = -EFAULT;
3000                 if (copy_to_user(argp, &kvm_mce_cap_supported,
3001                                  sizeof(kvm_mce_cap_supported)))
3002                         goto out;
3003                 r = 0;
3004                 break;
             }
3005         case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3006                 struct kvm_msr_list __user *user_msr_list = argp;
3007                 struct kvm_msr_list msr_list;
3008                 unsigned int n;
3009
3010                 r = -EFAULT;
3011                 if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3012                         goto out;
3013                 n = msr_list.nmsrs;
3014                 msr_list.nmsrs = num_msr_based_features;
3015                 if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3016                         goto out;
3017                 r = -E2BIG;
3018                 if (n < msr_list.nmsrs)
3019                         goto out;
3020                 r = -EFAULT;
3021                 if (copy_to_user(user_msr_list->indices, &msr_based_features,
3022                                  num_msr_based_features * sizeof(u32)))
3023                         goto out;
3024                 r = 0;
3025                 break;
3026         }
3027         case KVM_GET_MSRS:
3028                 r = msr_io(NULL, argp, do_get_msr_feature, 1);
3029                 break;
3031         default:
3032                 r = -EINVAL;
3033         }
3034 out:
3035         return r;
3036 }
3037
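     /* SMP call function callback: flush caches on whichever CPU runs it. */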
3038 static void wbinvd_ipi(void *garbage)
3039 {
3040         wbinvd();
3041 }
3042
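     /*
      * Guest WBINVD only needs real cache flushes when the VM has
      * non-coherent DMA, i.e. when cache contents are visible to a device.
      */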
3043 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3044 {
3045         return kvm_arch_has_noncoherent_dma(vcpu->kvm);
3046 }
3047
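     /*
      * Called when a vCPU is scheduled onto a physical CPU: propagate any
      * pending WBINVD flush, hand off to the vendor module, apply host TSC
      * adjustments (e.g. after suspend) and queue clock/steal-time updates
      * if the vCPU migrated or the TSC is unstable.
      */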
3048 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3049 {
3050         /* The guest may execute WBINVD; flush or track caches accordingly */
3051         if (need_emulate_wbinvd(vcpu)) {
3052                 if (kvm_x86_ops->has_wbinvd_exit())
3053                         cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3054                 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3055                         smp_call_function_single(vcpu->cpu,
3056                                         wbinvd_ipi, NULL, 1);
3057         }
3058
3059         kvm_x86_ops->vcpu_load(vcpu, cpu);
3060
3061         /* Apply any externally detected TSC adjustments (due to suspend) */
3062         if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3063                 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3064                 vcpu->arch.tsc_offset_adjustment = 0;
3065                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3066         }
3067
3068         if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
3069                 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
3070                                 rdtsc() - vcpu->arch.last_host_tsc;
3071                 if (tsc_delta < 0)
3072                         mark_tsc_unstable("KVM discovered backwards TSC");
3073
3074                 if (kvm_check_tsc_unstable()) {
3075                         u64 offset = kvm_compute_tsc_offset(vcpu,
3076                                                 vcpu->arch.last_guest_tsc);
3077                         kvm_vcpu_write_tsc_offset(vcpu, offset);
3078                         vcpu->arch.tsc_catchup = 1;
3079                 }
3080
3081                 if (kvm_lapic_hv_timer_in_use(vcpu))
3082                         kvm_lapic_restart_hv_timer(vcpu);
3083
3084                 /*
3085                  * On a host with synchronized TSC, there is no need to update
3086                  * kvmclock on vcpu->cpu migration
3087                  */
3088                 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
3089                         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
3090                 if (vcpu->cpu != cpu)
3091                         kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
3092                 vcpu->cpu = cpu;
3093         }
3094
3095         kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3096 }
3097
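     /*
      * Publish KVM_VCPU_PREEMPTED in the guest's steal-time area, but only
      * if the guest has enabled the steal-time MSR.
      */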
3098 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3099 {
3100         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3101                 return;
3102
3103         vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
3104
3105         kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
3106                         &vcpu->arch.st.steal.preempted,
3107                         offsetof(struct kvm_steal_time, preempted),
3108                         sizeof(vcpu->arch.st.steal.preempted));
3109 }
3110
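     /*
      * Called when a vCPU is descheduled: note whether it was preempted in
      * kernel mode, publish the preempted flag to the guest, record the host
      * TSC for the next load, and clear DR6 as do_debug expects.
      */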
3111 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3112 {
3113         int idx;
3114
3115         if (vcpu->preempted)
3116                 vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
3117
3118         /*
3119          * Disable page faults because we're in atomic context here.
3120          * kvm_write_guest_offset_cached() would call might_fault(),
3121          * which relies on pagefault_disable() to tell whether a fault
3122          * here would be a bug. NOTE: the write to guest memory may not
3123          * go through during postcopy live migration or under heavy
3124          * guest paging.
3125          */
3126         pagefault_disable();
3127         /*
3128          * kvm_memslots() will be called by
3129          * kvm_write_guest_offset_cached() so take the srcu lock.
3130          */
3131         idx = srcu_read_lock(&vcpu->kvm->srcu);
3132         kvm_steal_time_set_preempted(vcpu);
3133         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3134         pagefault_enable();
3135         kvm_x86_ops->vcpu_put(vcpu);
3136         vcpu->arch.last_host_tsc = rdtsc();
3137         /*
3138          * If userspace has set any breakpoints or watchpoints, dr6 is restored
3139          * on every vmexit, but if not, we might have a stale dr6 from the
3140          * guest. do_debug expects dr6 to be cleared after it runs; do the same.
3141          */
3142         set_debugreg(0, 6);
3143 }
3144
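     /*
      * Copy the in-kernel local APIC state out to userspace; with APICv,
      * posted interrupts are first synced from the PIR into the IRR.
      */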
3145 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
3146                                     struct kvm_lapic_state *s)
3147 {
3148         if (vcpu->arch.apicv_active)
3149                 kvm_x86_ops->sync_pir_to_irr(vcpu);
3150
3151         return kvm_apic_get_state(vcpu, s);
3152 }
3153
3154 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
3155                                     struct kvm_lapic_state *s)
3156 {
3157         int r;
3158
3159         r = kvm_apic_set_state(vcpu, s);
3160         if (r)
3161                 return r;
3162         update_cr8_intercept(vcpu);
3163
3164         return 0;
3165 }
3166
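     /*
      * True if an interrupt injected by userspace can be delivered: either
      * the LAPIC is emulated in userspace, or the in-kernel LAPIC will
      * accept a PIC (ExtINT) interrupt.
      */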
3167 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
3168 {
3169         return (!lapic_in_kernel(vcpu) ||
3170                 kvm_apic_accept_pic_intr(vcpu));
3171 }
3172
3173 /*
3174  * if userspace requested an interrupt window, check that the
3175  * interrupt window is open.
3176  *
3177  * No need to exit to userspace if we already have an interrupt queued.
3178  */
3179 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
3180 {
3181         return kvm_arch_interrupt_allowed(vcpu) &&
3182                 !kvm_cpu_has_interrupt(vcpu) &&
3183                 !kvm_event_needs_reinjection(vcpu) &&
3184                 kvm_cpu_accept_dm_intr(vcpu);
3185 }
3186
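     /*
      * KVM_INTERRUPT: with a userspace irqchip the vector is queued
      * directly; with a split irqchip only a single pending ExtINT vector
      * is recorded, and the ioctl is rejected when the PIC is in the kernel.
      */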
3187 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
3188                                     struct kvm_interrupt *irq)
3189 {
3190         if (irq->irq >= KVM_NR_INTERRUPTS)
3191                 return -EINVAL;
3192
3193         if (!irqchip_in_kernel(vcpu->kvm)) {
3194                 kvm_queue_interrupt(vcpu, irq->irq, false);
3195                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3196                 return 0;
3197         }
3198
3199         /*
3200          * With in-kernel LAPIC, we only use this to inject EXTINT, so
3201          * fail for in-kernel 8259.
3202          */
3203         if (pic_in_kernel(vcpu->kvm))
3204                 return -ENXIO;
3205
3206         if (vcpu->arch.pending_external_vector != -1)
3207                 return -EEXIST;
3208
3209         vcpu->arch.pending_external_vector = irq->irq;
3210         kvm_make_request(KVM_REQ_EVENT, vcpu);
3211         return 0;
3212 }
3213
3214 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
3215 {
3216         kvm_inject_nmi(vcpu);
3217
3218         return 0;
3219 }
3220
3221 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
3222 {
3223         kvm_make_request(KVM_REQ_SMI, vcpu);
3224
3225         return 0;
3226 }
3227
3228 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
3229                                            struct kvm_tpr_access_ctl *tac)
3230 {
3231         if (tac->flags)
3232                 return -EINVAL;
3233         vcpu->arch.tpr_access_reporting = !!tac->enabled;
3234         return 0;
3235 }
3236
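     /*
      * KVM_X86_SETUP_MCE: validate the requested MCG_CAP against what KVM
      * supports, then enable everything by initializing MCG_CTL and the
      * per-bank MCi_CTL registers to all ones.
      */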
3237 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
3238                                         u64 mcg_cap)
3239 {
3240         int r;
3241         unsigned bank_num = mcg_cap & 0xff, bank;
3242
3243         r = -EINVAL;
3244         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
3245                 goto out;
3246         if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
3247                 goto out;
3248         r = 0;
3249         vcpu->arch.mcg_cap = mcg_cap;
3250         /* Init IA32_MCG_CTL to all 1s */
3251         if (mcg_cap & MCG_CTL_P)
3252                 vcpu->arch.mcg_ctl = ~(u64)0;
3253         /* Init IA32_MCi_CTL to all 1s */
3254         for (bank = 0; bank < bank_num; bank++)
3255                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
3256
3257         if (kvm_x86_ops->setup_mce)
3258                 kvm_x86_ops->setup_mce(vcpu);
3259 out:
3260         return r;
3261 }
3262
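     /*
      * KVM_X86_SET_MCE: inject a machine-check event.  Uncorrected errors
      * are dropped if MCG_CTL/MCi_CTL gate them off, escalate to a triple
      * fault if MCIP is already set or CR4.MCE is clear, and otherwise
      * raise #MC; corrected errors are only logged in the bank.
      */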
3263 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
3264                                       struct kvm_x86_mce *mce)
3265 {
3266         u64 mcg_cap = vcpu->arch.mcg_cap;
3267         unsigned bank_num = mcg_cap & 0xff;
3268         u64 *banks = vcpu->arch.mce_banks;
3269
3270         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3271                 return -EINVAL;
3272         /*
3273          * if IA32_MCG_CTL is not all 1s, the uncorrected error
3274          * reporting is disabled
3275          */
3276         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3277             vcpu->arch.mcg_ctl != ~(u64)0)
3278                 return 0;
3279         banks += 4 * mce->bank;
3280         /*
3281          * if IA32_MCi_CTL is not all 1s, the uncorrected error
3282          * reporting is disabled for the bank
3283          */
3284         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3285                 return 0;
3286         if (mce->status & MCI_STATUS_UC) {
3287                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
3288                     !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
3289                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3290                         return 0;
3291                 }
3292                 if (banks[1] & MCI_STATUS_VAL)
3293                         mce->status |= MCI_STATUS_OVER;
3294                 banks[2] = mce->addr;
3295                 banks[3] = mce->misc;
3296                 vcpu->arch.mcg_status = mce->mcg_status;
3297                 banks[1] = mce->status;
3298                 kvm_queue_exception(vcpu, MC_VECTOR);
3299         } else if (!(banks[1] & MCI_STATUS_VAL)
3300                    || !(banks[1] & MCI_STATUS_UC)) {
3301                 if (banks[1] & MCI_STATUS_VAL)
3302                         mce->status |= MCI_STATUS_OVER;
3303                 banks[2] = mce->addr;
3304                 banks[3] = mce->misc;
3305                 banks[1] = mce->status;
3306         } else
3307                 banks[1] |= MCI_STATUS_OVER;
3308         return 0;
3309 }
3310
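     /*
      * KVM_GET_VCPU_EVENTS: snapshot pending exception, interrupt, NMI and
      * SMM state so userspace can save or migrate it.
      */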
3311 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3312                                                struct kvm_vcpu_events *events)
3313 {
3314         process_nmi(vcpu);
3315         /*
3316          * FIXME: pass injected and pending separately.  This is only
3317          * needed for nested virtualization, whose state cannot be
3318          * migrated yet.  For now we can combine them.
3319          */
3320         events->exception.injected =
3321                 (vcpu->arch.exception.pending ||
3322                  vcpu->arch.exception.injected) &&
3323                 !kvm_exception_is_soft(vcpu->arch.exception.nr);
3324         events->exception.nr = vcpu->arch.exception.nr;
3325         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3326         events->exception.pad = 0;
3327         events->exception.error_code = vcpu->arch.exception.error_code;
3328
3329         events->interrupt.injected =
3330                 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3331         events->interrupt.nr = vcpu->arch.interrupt.nr;
3332         events->interrupt.soft = 0;
3333         events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
3334
3335         events->nmi.injected = vcpu->arch.nmi_injected;
3336         events->nmi.pending = vcpu->arch.nmi_pending != 0;
3337         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
3338         events->nmi.pad = 0;
3339
3340         events->sipi_vector = 0; /* never valid when reporting to user space */
3341
3342         events->smi.smm = is_smm(vcpu);
3343         events->smi.pending = vcpu->arch.smi_pending;
3344         events->smi.smm_inside_nmi =
3345                 !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
3346         events->smi.latched_init = kvm_lapic_latched_init(vcpu);
3347
3348         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3349                          | KVM_VCPUEVENT_VALID_SHADOW
3350                          | KVM_VCPUEVENT_VALID_SMM);
3351         memset(&events->reserved, 0, sizeof(events->reserved));
3352 }
3353
3354 static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
3355
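     /*
      * KVM_SET_VCPU_EVENTS: restore the event state captured above,
      * rejecting invalid exception vectors and SMM state that conflicts
      * with a latched INIT.
      */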
3356 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3357                                               struct kvm_vcpu_events *events)
3358 {
3359         if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3360                               | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3361                               | KVM_VCPUEVENT_VALID_SHADOW
3362                               | KVM_VCPUEVENT_VALID_SMM))
3363                 return -EINVAL;
3364
3365         if (events->exception.injected &&
3366             (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3367              is_guest_mode(vcpu)))
3368                 return -EINVAL;
3369
3370         /* INITs are latched while in SMM */
3371         if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
3372             (events->smi.smm || events->smi.pending) &&
3373             vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3374                 return -EINVAL;
3375
3376         process_nmi(vcpu);
3377         vcpu->arch.exception.injected = false;
3378         vcpu->arch.exception.pending = events->exception.injected;
3379         vcpu->arch.exception.nr = events->exception.nr;
3380         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3381         vcpu->arch.exception.error_code = events->exception.error_code;
3382
3383         vcpu->arch.interrupt.injected = events->interrupt.injected;
3384         vcpu->arch.interrupt.nr = events->interrupt.nr;
3385         vcpu->arch.interrupt.soft = events->interrupt.soft;
3386         if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3387                 kvm_x86_ops->set_interrupt_shadow(vcpu,
3388                                                   events->interrupt.shadow);
3389
3390         vcpu->arch.nmi_injected = events->nmi.injected;
3391         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3392                 vcpu->arch.nmi_pending = events->nmi.pending;
3393         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3394
3395         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3396             lapic_in_kernel(vcpu))
3397                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
3398
3399         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3400                 u32 hflags = vcpu->arch.hflags;
3401                 if (events->smi.smm)
3402                         hflags |= HF_SMM_MASK;
3403                 else
3404                         hflags &= ~HF_SMM_MASK;
3405                 kvm_set_hflags(vcpu, hflags);
3406
3407                 vcpu->arch.smi_pending = events->smi.pending;
3408
3409                 if (events->smi.smm) {
3410                         if (events->smi.smm_inside_nmi)
3411                                 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
3412                         else
3413                                 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3414                         if (lapic_in_kernel(vcpu)) {
3415                                 if (events->smi.latched_init)
3416                                         set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3417                                 else
3418                                         clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3419                         }
3420                 }
3421         }
3422
3423         kvm_make_request(KVM_REQ_EVENT, vcpu);
3424
3425         return 0;
3426 }
3427
3428 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3429                                              struct kvm_debugregs *dbgregs)
3430 {
3431         unsigned long val;
3432
3433         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3434         kvm_get_dr(vcpu, 6, &val);
3435         dbgregs->dr6 = val;
3436         dbgregs->dr7 = vcpu->arch.dr7;
3437         dbgregs->flags = 0;
3438         memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3439 }
3440
3441 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3442                                             struct kvm_debugregs *dbgregs)
3443 {
3444         if (dbgregs->flags)
3445                 return -EINVAL;
3446
3447         if (dbgregs->dr6 & ~0xffffffffull)
3448                 return -EINVAL;
3449         if (dbgregs->dr7 & ~0xffffffffull)
3450                 return -EINVAL;
3451
3452         memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3453         kvm_update_dr0123(vcpu);
3454         vcpu->arch.dr6 = dbgregs->dr6;
3455         kvm_update_dr6(vcpu);
3456         vcpu->arch.dr7 = dbgregs->dr7;
3457         kvm_update_dr7(vcpu);
3458
3459         return 0;
3460 }
3461
3462 #define XSTATE_COMPACTION_ENABLED (1ULL << 63)
3463
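     /*
      * Convert the guest FPU's xsave image, which may be in the compacted
      * format used by XSAVES, into the non-compacted layout exposed to
      * userspace (KVM_GET_XSAVE).  PKRU is taken from vcpu->arch.pkru
      * rather than from the xsave buffer itself.
      */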
3464 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
3465 {
3466         struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
3467         u64 xstate_bv = xsave->header.xfeatures;
3468         u64 valid;
3469
3470         /*
3471          * Copy legacy XSAVE area, to avoid complications with CPUID
3472          * leaves 0 and 1 in the loop below.
3473          */
3474         memcpy(dest, xsave, XSAVE_HDR_OFFSET);
3475
3476         /* Set XSTATE_BV */
3477         xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
3478         *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
3479
3480         /*
3481          * Copy each region from the possibly compacted offset to the
3482          * non-compacted offset.
3483          */
3484         valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
3485         while (valid) {
3486                 u64 feature = valid & -valid;
3487                 int index = fls64(feature) - 1;
3488                 void *src = get_xsave_addr(xsave, feature);
3489
3490                 if (src) {
3491                         u32 size, offset, ecx, edx;
3492                         cpuid_count(XSTATE_CPUID, index,
3493                                     &size, &offset, &ecx, &edx);
3494                         if (feature == XFEATURE_MASK_PKRU)
3495                                 memcpy(dest + offset, &vcpu->arch.pkru,
3496                                        sizeof(vcpu->arch.pkru));
3497                         else
3498                                 memcpy(dest + offset, src, size);