/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "assigned-dev.h"
#include "pmu.h"
#include "hyperv.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/ipipe.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)

#define emul_to_vcpu(ctxt) \
	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)

/* EFER defaults:
 * - enable syscall by default because it is emulated by KVM
 * - enable LME and LMA by default on 64-bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);

struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);

unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32  __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64  __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
static u64 __read_mostly kvm_default_tsc_scaling_ratio;

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

/* lapic timer advance (tscdeadline mode only) in nanoseconds */
unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly backwards_tsc_observed = false;

#define KVM_NR_SHARED_MSRS 16

struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};

struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	bool dirty;
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};

static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

u64 __read_mostly host_xcr0;

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;
	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

static void kvm_restore_shared_msrs(struct kvm_shared_msrs *locals)
{
	struct kvm_shared_msr_values *values;
	unsigned long flags;
	unsigned int slot;

	flags = hard_cond_local_irq_save();
	if (locals->dirty) {
		for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
			values = &locals->values[slot];
			if (values->host != values->curr) {
				wrmsrl(shared_msrs_global.msrs[slot],
				       values->host);
				values->curr = values->host;
			}
		}
		locals->dirty = false;
	}
	hard_cond_local_irq_restore(flags);
}

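/*
 * kvm_on_user_return() runs from the user-return notifier, i.e. right before
 * the CPU returns to userspace, and rolls any shared MSRs the guest has
 * modified back to their host values via kvm_restore_shared_msrs().
 */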
static void kvm_on_user_return(struct user_return_notifier *urn)
{
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);
	unsigned long flags;

	/*
	 * Disabling irqs at this point since the following code could be
	 * interrupted and executed through kvm_arch_hardware_disable()
	 */
	local_irq_save(flags);
	if (locals->registered) {
		locals->registered = false;
		user_return_notifier_unregister(urn);
	}
	local_irq_restore(flags);
	kvm_restore_shared_msrs(locals);
	__ipipe_exit_vm();
}

static void shared_msr_update(unsigned slot, u32 msr)
{
	u64 value;
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	/* The list is only read here, and nobody should be modifying it at
	 * this point, so no lock is needed. */
	if (slot >= shared_msrs_global.nr) {
		printk(KERN_ERR "kvm: invalid MSR slot!");
		return;
	}
	rdmsrl_safe(msr, &value);
	smsr->values[slot].host = value;
	smsr->values[slot].curr = value;
}

void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
	shared_msrs_global.msrs[slot] = msr;
	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);

static void kvm_shared_msr_cpu_online(void)
{
	unsigned i;

	for (i = 0; i < shared_msrs_global.nr; ++i)
		shared_msr_update(i, shared_msrs_global.msrs[i]);
}

int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
	int err;

	if (((value ^ smsr->values[slot].curr) & mask) == 0)
		return 0;
	smsr->values[slot].curr = value;
	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
	if (err)
		return 1;

	smsr->dirty = true;
	if (!smsr->registered) {
		smsr->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&smsr->urn);
		smsr->registered = true;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);

static void drop_user_return_notifiers(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	if (smsr->registered)
		kvm_on_user_return(&smsr->urn);
}

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

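/*
 * For guest-initiated writes, kvm_set_apic_base() rejects values that set
 * reserved bits or that request an invalid APIC mode transition: enabling
 * x2APIC without the global enable bit, leaving x2APIC for plain xAPIC
 * directly, or jumping from the disabled state straight into x2APIC mode.
 */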
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	u64 old_state = vcpu->arch.apic_base &
		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
	u64 new_state = msr_info->data &
		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
		0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);

	if (!msr_info->host_initiated &&
	    ((msr_info->data & reserved_bits) != 0 ||
	     new_state == X2APIC_ENABLE ||
	     (new_state == MSR_IA32_APICBASE_ENABLE &&
	      old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
	     (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
	      old_state == 0)))
		return 1;

	kvm_lapic_set_base(vcpu, msr_info->data);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

asmlinkage __visible void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

#define EXCPT_BENIGN		0
#define EXCPT_CONTRIBUTORY	1
#define EXCPT_PF		2

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR:
	case TS_VECTOR:
	case NP_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		break;
	}
	return EXCPT_BENIGN;
}

#define EXCPT_FAULT		0
#define EXCPT_TRAP		1
#define EXCPT_ABORT		2
#define EXCPT_INTERRUPT		3

static int exception_type(int vector)
{
	unsigned int mask;

	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
		return EXCPT_INTERRUPT;

	mask = 1 << vector;

	/* #DB is trap, as instruction watchpoints are handled elsewhere */
	if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
		return EXCPT_TRAP;

	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
		return EXCPT_ABORT;

	/* Reserved exceptions will result in fault */
	return EXCPT_FAULT;
}

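/*
 * Queue an exception for injection, merging it with any exception that is
 * already pending: two contributory exceptions, or any non-benign exception
 * following a #PF, escalate to #DF, and a further fault while a #DF is
 * pending requests a triple-fault shutdown of the guest.
 */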
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	if (!vcpu->arch.exception.pending) {
	queue:
		if (has_error && !is_protmode(vcpu))
			has_error = false;
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.nr = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.reinject = reinject;
		return;
	}

	/* to check exception */
	prev_nr = vcpu->arch.exception.nr;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/* generate double fault per SDM Table 5-5 */
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = true;
		vcpu->arch.exception.nr = DF_VECTOR;
		vcpu->arch.exception.error_code = 0;
	} else
		/* replace previous exception with a new one in a hope
		   that instruction re-execution will regenerate lost
		   exception */
		goto queue;
}

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		kvm_x86_ops->skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;
	vcpu->arch.cr2 = fault->address;
	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);

static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
	else
		vcpu->arch.mmu.inject_page_fault(vcpu, fault);

	return fault->nested_page_fault;
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
		return true;

	kvm_queue_exception(vcpu, UD_VECTOR);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_dr);

/*
 * This function will be used to read from the physical memory of the currently
 * running guest. The difference to kvm_vcpu_read_guest_page is that this function
 * can read from guest physical or from the guest's guest physical memory.
 */
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gfn_t ngfn, void *data, int offset, int len,
			    u32 access)
{
	struct x86_exception exception;
	gfn_t real_gfn;
	gpa_t ngpa;

	ngpa     = gfn_to_gpa(ngfn);
	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
	if (real_gfn == UNMAPPED_GVA)
		return -EFAULT;

	real_gfn = gpa_to_gfn(real_gfn);

	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);

static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
			       void *data, int offset, int len, u32 access)
{
	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				       data, offset, len, access);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
				      offset * sizeof(u64), sizeof(pdpte),
				      PFERR_USER_MASK|PFERR_WRITE_MASK);
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] &
		     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

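/*
 * pdptrs_changed() re-reads the guest's PDPTEs and compares them with the
 * cached copy in walk_mmu; it returns true when the cache is stale (or not
 * available) and the PDPTRs therefore need to be reloaded.
 */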
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
	bool changed = true;
	int offset;
	gfn_t gfn;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				       PFERR_USER_MASK | PFERR_WRITE_MASK);
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

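/*
 * kvm_set_cr0() enforces the architectural checks on the new CR0 value
 * (reserved bits, NW without CD, PG without PE, long-mode and PAE/PDPTR
 * consistency) before handing it to the vendor module, and then resets the
 * MMU context or zaps mappings when paging- or cache-related bits change.
 */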
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);
	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;

	cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return 1;
#endif

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return 1;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return 1;

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				return 1;
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				return 1;
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
						 kvm_read_cr3(vcpu)))
			return 1;
	}

	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
		return 1;

	kvm_x86_ops->set_cr0(vcpu, cr0);

	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);
	}

	if ((cr0 ^ old_cr0) & update_bits)
		kvm_mmu_reset_context(vcpu);

	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
{
	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
			!vcpu->guest_xcr0_loaded) {
		/* kvm_set_xcr() also depends on this */
		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
		vcpu->guest_xcr0_loaded = 1;
	}
}

static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
	if (vcpu->guest_xcr0_loaded) {
		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
		vcpu->guest_xcr0_loaded = 0;
	}
}

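/*
 * __kvm_set_xcr() validates the new XCR0 value: FP must always be set, AVX
 * state requires SSE state, the two MPX components must be enabled together,
 * the AVX-512 components require AVX and must all be set as a group, and no
 * bit outside of what the guest's CPUID advertises may be enabled.
 */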
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0 = xcr;
	u64 old_xcr0 = vcpu->arch.xcr0;
	u64 valid_bits;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	if (!(xcr0 & XFEATURE_MASK_FP))
		return 1;
	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
		return 1;

	/*
	 * Do not allow the guest to set bits that we do not support
	 * saving.  However, xcr0 bit 0 is always set, even if the
	 * emulated CPU does not support XSAVE (see fx_init).
	 */
	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
	if (xcr0 & ~valid_bits)
		return 1;

	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
		return 1;

	if (xcr0 & XFEATURE_MASK_AVX512) {
		if (!(xcr0 & XFEATURE_MASK_YMM))
			return 1;
		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
			return 1;
	}
	vcpu->arch.xcr0 = xcr0;

	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
		kvm_update_cpuid(vcpu);
	return 0;
}

int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
	    __kvm_set_xcr(vcpu, index, xcr)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);

int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
				   X86_CR4_SMEP | X86_CR4_SMAP;

	if (cr4 & CR4_RESERVED_BITS)
		return 1;

	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
		return 1;

	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
		return 1;

	if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
		return 1;

	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				   kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		if (!guest_cpuid_has_pcid(vcpu))
			return 1;

		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
		    !is_long_mode(vcpu))
			return 1;
	}

	if (kvm_x86_ops->set_cr4(vcpu, cr4))
		return 1;

	if (((cr4 ^ old_cr4) & pdptr_bits) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_mmu_reset_context(vcpu);

	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
		kvm_update_cpuid(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
#ifdef CONFIG_X86_64
	cr3 &= ~CR3_PCID_INVD;
#endif

	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		return 0;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS)
			return 1;
	} else if (is_pae(vcpu) && is_paging(vcpu) &&
		   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
		return 1;

	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
	kvm_mmu_new_cr3(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (lapic_in_kernel(vcpu))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
	int i;

	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
	}
}

static void kvm_update_dr6(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
		kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
}

static void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	kvm_x86_ops->set_dr7(vcpu, dr7);
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
	if (dr7 & DR7_BP_EN_MASK)
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}

static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
	u64 fixed = DR6_FIXED_1;

	if (!guest_cpuid_has_rtm(vcpu))
		fixed |= DR6_RTM;
	return fixed;
}

static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[dr] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
		/* fall through */
	case 6:
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
		kvm_update_dr6(vcpu);
		break;
	case 5:
		/* fall through */
	default: /* 7 */
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}

int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	if (__kvm_set_dr(vcpu, dr, val)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	switch (dr) {
	case 0 ... 3:
		*val = vcpu->arch.db[dr];
		break;
	case 4:
		/* fall through */
	case 6:
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
			*val = vcpu->arch.dr6;
		else
			*val = kvm_x86_ops->get_dr6(vcpu);
		break;
	case 5:
		/* fall through */
	default: /* 7 */
		*val = vcpu->arch.dr7;
		break;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
	u64 data;
	int err;

	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
	if (err)
		return err;
	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
	return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
 * may depend on host virtualization features rather than host cpu features.
 */

static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
	MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
	HV_X64_MSR_RESET,
	HV_X64_MSR_VP_INDEX,
	HV_X64_MSR_VP_RUNTIME,
	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN,

	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSCDEADLINE,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
	MSR_IA32_SMBASE,
	MSR_AMD64_VIRT_SPEC_CTRL,
};

static unsigned num_emulated_msrs;

bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits)
		return false;

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
			return false;
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
			return false;
	}

	return true;
}
EXPORT_SYMBOL_GPL(kvm_valid_efer);

static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	u64 old_efer = vcpu->arch.efer;

	if (!kvm_valid_efer(vcpu, efer))
		return 1;

	if (is_paging(vcpu)
	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
		return 1;

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.efer & EFER_LMA;

	kvm_x86_ops->set_efer(vcpu, efer);

	/* Update reserved bits */
	if ((efer ^ old_efer) & EFER_NX)
		kvm_mmu_reset_context(vcpu);

	return 0;
}

void kvm_enable_efer_bits(u64 mask)
{
       efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);

/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	switch (msr->index) {
	case MSR_FS_BASE:
	case MSR_GS_BASE:
	case MSR_KERNEL_GS_BASE:
	case MSR_CSTAR:
	case MSR_LSTAR:
		if (is_noncanonical_address(msr->data))
			return 1;
		break;
	case MSR_IA32_SYSENTER_EIP:
	case MSR_IA32_SYSENTER_ESP:
		/*
		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
		 * non-canonical address is written on Intel but not on
		 * AMD (which ignores the top 32-bits, because it does
		 * not implement 64-bit SYSENTER).
		 *
		 * 64-bit code should hence be able to write a non-canonical
		 * value on AMD.  Making the address canonical ensures that
		 * vmentry does not fail on Intel after writing a non-canonical
		 * value, and that something deterministic happens if the guest
		 * invokes 64-bit SYSENTER.
		 */
		msr->data = get_canonical(msr->data);
	}
	return kvm_x86_ops->set_msr(vcpu, msr);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	struct msr_data msr;
	int r;

	msr.index = index;
	msr.host_initiated = true;
	r = kvm_get_msr(vcpu, &msr);
	if (r)
		return r;

	*data = msr.data;
	return 0;
}

static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	struct msr_data msr;

	msr.data = *data;
	msr.index = index;
	msr.host_initiated = true;
	return kvm_set_msr(vcpu, &msr);
}

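/*
 * pvclock_gtod_data mirrors the fields of the host timekeeper that KVM needs
 * (the clocksource parameters and the boot-time offset) so that kvmclock code
 * can compute host time without taking the timekeeper's locks; it is
 * refreshed from update_pvclock_gtod() under its own seqcount.
 */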
#ifdef CONFIG_X86_64
struct pvclock_gtod_data {
	seqcount_t	seq;

	struct { /* extract of a clocksource struct */
		int vclock_mode;
		cycle_t	cycle_last;
		cycle_t	mask;
		u32	mult;
		u32	shift;
	} clock;

	u64		boot_ns;
	u64		nsec_base;
};

static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
	u64 boot_ns;

	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
	vdata->clock.mask		= tk->tkr_mono.mask;
	vdata->clock.mult		= tk->tkr_mono.mult;
	vdata->clock.shift		= tk->tkr_mono.shift;

	vdata->boot_ns			= boot_ns;
	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;

	write_seqcount_end(&vdata->seq);
}
#endif

void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
{
	/*
	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
	 * vcpu_enter_guest.  This function is only called from
	 * the physical CPU that is running vcpu.
	 */
	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
}

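/*
 * kvm_write_wall_clock() publishes the wall-clock time at the guest-supplied
 * address using the pvclock versioning protocol: the version field is bumped
 * to an odd value before the payload is written and to an even value after,
 * so the guest retries if it observes an in-progress (odd) version.
 */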
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	int version;
	int r;
	struct pvclock_wall_clock wc;
	struct timespec boot;

	if (!wall_clock)
		return;

	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
	if (r)
		return;

	if (version & 1)
		++version;  /* first time write, random junk */

	++version;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_guest_time_update below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
	 */
	getboottime(&boot);

	if (kvm->arch.kvmclock_offset) {
		struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
		boot = timespec_sub(boot, ts);
	}
	wc.sec = boot.tv_sec;
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

	version++;
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}

static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	uint32_t quotient, remainder;

	/* Don't try to replace with do_div(), this one calculates
	 * "(dividend << 32) / divisor" */
	__asm__ ( "divl %4"
		  : "=a" (quotient), "=d" (remainder)
		  : "0" (0), "1" (dividend), "r" (divisor) );
	return quotient;
}

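/*
 * kvm_get_time_scale() chooses a (shift, multiplier) pair such that
 * pvclock_scale_delta(delta, *pmultiplier, *pshift) approximates
 * delta * scaled_khz / base_khz, i.e. it converts a tick count of the base
 * clock into ticks of the scaled clock using only shifts and a 32x32->64
 * multiply.
 */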
static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
			       s8 *pshift, u32 *pmultiplier)
{
	uint64_t scaled64;
	int32_t  shift = 0;
	uint64_t tps64;
	uint32_t tps32;

	tps64 = base_khz * 1000LL;
	scaled64 = scaled_khz * 1000LL;
	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
		tps64 >>= 1;
		shift--;
	}

	tps32 = (uint32_t)tps64;
	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
			scaled64 >>= 1;
		else
			tps32 <<= 1;
		shift++;
	}

	*pshift = shift;
	*pmultiplier = div_frac(scaled64, tps32);

	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
}

#ifdef CONFIG_X86_64
static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif

static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
static unsigned long max_tsc_khz;

static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
				   vcpu->arch.virtual_tsc_shift);
}

static u32 adjust_tsc_khz(u32 khz, s32 ppm)