#include "x86/msr.h" #include "x86/processor.h" #include "x86/pmu.h" #include "x86/apic-defs.h" #include "x86/apic.h" #include "x86/desc.h" #include "x86/isr.h" #include "vmalloc.h" #include "alloc.h" #include "libcflat.h" #include #define N 1000000 #define IBPB_JMP_INSNS 9 #define IBPB_JMP_BRANCHES 2 #if defined(__i386__) || defined(_M_IX86) /* i386 */ #define IBPB_JMP_ASM(_wrmsr) \ "mov $1, %%eax; xor %%edx, %%edx;\n\t" \ "mov $73, %%ecx;\n\t" \ _wrmsr "\n\t" \ "call 1f\n\t" \ "1: pop %%eax\n\t" \ "add $(2f-1b), %%eax\n\t" \ "jmp *%%eax;\n\t" \ "nop;\n\t" \ "2: nop;\n\t" #else /* x86_64 */ #define IBPB_JMP_ASM(_wrmsr) \ "mov $1, %%eax; xor %%edx, %%edx;\n\t" \ "mov $73, %%ecx;\n\t" \ _wrmsr "\n\t" \ "call 1f\n\t" \ "1: pop %%rax\n\t" \ "add $(2f-1b), %%rax\n\t" \ "jmp *%%rax;\n\t" \ "nop;\n\t" \ "2: nop;\n\t" #endif /* GLOBAL_CTRL enable + disable + clflush/mfence + IBPB_JMP */ #define EXTRA_INSNS (3 + 3 + 2 + IBPB_JMP_INSNS) #define LOOP_INSNS (N * 10 + EXTRA_INSNS) #define LOOP_BRANCHES (N + IBPB_JMP_BRANCHES) #define LOOP_ASM(_wrmsr1, _clflush, _wrmsr2) \ _wrmsr1 "\n\t" \ "mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t" \ _clflush "\n\t" \ "mfence;\n\t" \ "1: mov (%1), %2; add $64, %1;\n\t" \ "nop; nop; nop; nop; nop; nop; nop;\n\t" \ "loop 1b;\n\t" \ IBPB_JMP_ASM(_wrmsr2) \ "mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t" \ _wrmsr1 "\n\t" #define _loop_asm(_wrmsr1, _clflush, _wrmsr2) \ do { \ asm volatile(LOOP_ASM(_wrmsr1, _clflush, _wrmsr2) \ : "=b"(tmp), "=r"(tmp2), "=r"(tmp3) \ : "a"(eax), "d"(edx), "c"(global_ctl), \ "0"(N), "1"(buf) \ : "edi"); \ } while (0) /* the number of instructions and branches of the kvm_fep_asm() blob */ #define KVM_FEP_INSNS 22 #define KVM_FEP_BRANCHES 5 /* * KVM_FEP is a magic prefix that forces emulation so * 'KVM_FEP "jne label\n"' just counts as a single instruction. */ #define kvm_fep_asm(_wrmsr) \ do { \ asm volatile( \ _wrmsr "\n\t" \ "mov %%ecx, %%edi;\n\t" \ "mov $0x0, %%eax;\n\t" \ "cmp $0x0, %%eax;\n\t" \ KVM_FEP "jne 1f\n\t" \ KVM_FEP "jne 1f\n\t" \ KVM_FEP "jne 1f\n\t" \ KVM_FEP "jne 1f\n\t" \ KVM_FEP "jne 1f\n\t" \ "mov $0xa, %%eax; cpuid;\n\t" \ "mov $0xa, %%eax; cpuid;\n\t" \ "mov $0xa, %%eax; cpuid;\n\t" \ "mov $0xa, %%eax; cpuid;\n\t" \ "mov $0xa, %%eax; cpuid;\n\t" \ "1: mov %%edi, %%ecx; \n\t" \ "xor %%eax, %%eax; \n\t" \ "xor %%edx, %%edx;\n\t" \ _wrmsr "\n\t" \ : \ : "a"(eax), "d"(edx), "c"(ecx) \ : "ebx", "edi"); \ } while (0) typedef struct { uint32_t ctr; uint32_t idx; uint64_t config; uint64_t count; } pmu_counter_t; struct pmu_event { const char *name; uint32_t unit_sel; int min; int max; } intel_gp_events[] = { {"core cycles", 0x003c, 1*N, 50*N}, {"instructions", 0x00c0, 10*N, 10.2*N}, {"ref cycles", 0x013c, 1*N, 30*N}, {"llc references", 0x4f2e, 1, 2*N}, {"llc misses", 0x412e, 1, 1*N}, {"branches", 0x00c4, 1*N, 1.1*N}, {"branch misses", 0x00c5, 1, 0.1*N}, }, amd_gp_events[] = { {"core cycles", 0x0076, 1*N, 50*N}, {"instructions", 0x00c0, 10*N, 10.2*N}, {"branches", 0x00c2, 1*N, 1.1*N}, {"branch misses", 0x00c3, 1, 0.1*N}, }, fixed_events[] = { {"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N}, {"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N}, {"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N} }; /* * Events index in intel_gp_events[], ensure consistent with * intel_gp_events[]. */ enum { INTEL_INSTRUCTIONS_IDX = 1, INTEL_REF_CYCLES_IDX = 2, INTEL_LLC_MISSES_IDX = 4, INTEL_BRANCHES_IDX = 5, INTEL_BRANCH_MISS_IDX = 6, }; /* * Events index in amd_gp_events[], ensure consistent with * amd_gp_events[]. 
 */
enum {
	AMD_INSTRUCTIONS_IDX	= 1,
	AMD_BRANCHES_IDX	= 2,
	AMD_BRANCH_MISS_IDX	= 3,
};

char *buf;

static struct pmu_event *gp_events;
static unsigned int gp_events_size;
static unsigned int fixed_counters_num;

static int has_ibpb(void)
{
	return this_cpu_has(X86_FEATURE_SPEC_CTRL) ||
	       this_cpu_has(X86_FEATURE_AMD_IBPB);
}

static inline void __loop(void)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = 0;
	u32 eax = 0;
	u32 edx = 0;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("nop", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("nop", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("nop", "nop", "wrmsr");
	else
		_loop_asm("nop", "nop", "nop");
}

/*
 * Enable and disable counters in a whole asm blob to ensure
 * no other instructions are counted in the window between
 * counters enabling and really LOOP_ASM code executing.
 * Thus counters can verify instructions and branches events
 * against precise counts instead of a rough valid count range.
 */
static inline void __precise_loop(u64 cntrs)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = pmu.msr_global_ctl;
	u32 eax = cntrs & (BIT_ULL(32) - 1);
	u32 edx = cntrs >> 32;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("wrmsr", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("wrmsr", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("wrmsr", "nop", "wrmsr");
	else
		_loop_asm("wrmsr", "nop", "nop");
}

static inline void loop(u64 cntrs)
{
	if (!this_cpu_has_perf_global_ctrl())
		__loop();
	else
		__precise_loop(cntrs);
}

static void adjust_events_range(struct pmu_event *gp_events,
				int instruction_idx, int branch_idx,
				int branch_miss_idx)
{
	/*
	 * If HW supports the GLOBAL_CTRL MSR, enabling and disabling PMCs are
	 * moved into __precise_loop().  Thus, the instructions and branches
	 * events can be verified against a precise count instead of a rough
	 * range.
	 *
	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
	 * instruction in guest context, which leads to intermittent failures
	 * as the counts will vary depending on how many asynchronous VM-Exits
	 * occur while running the measured code, e.g. if the host takes IRQs.
	 */
	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
		gp_events[instruction_idx].min = LOOP_INSNS;
		gp_events[instruction_idx].max = LOOP_INSNS;
		gp_events[branch_idx].min = LOOP_BRANCHES;
		gp_events[branch_idx].max = LOOP_BRANCHES;
	}

	/*
	 * For CPUs without IBPB support, there is no way to force a branch
	 * miss, so the measured branch-miss count may be 0.  Thus overwrite
	 * the lower boundary of the branch misses event to 0 to avoid false
	 * positives.
	 */
	if (!has_ibpb())
		gp_events[branch_miss_idx].min = 0;
}

volatile uint64_t irq_received;

static void cnt_overflow(isr_regs_t *regs)
{
	irq_received++;
	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
	apic_write(APIC_EOI, 0);
}

static bool check_irq(void)
{
	int i;

	irq_received = 0;
	sti();
	for (i = 0; i < 100000 && !irq_received; i++)
		asm volatile("pause");
	cli();
	return irq_received;
}

static bool is_gp(pmu_counter_t *evt)
{
	if (!pmu.is_intel)
		return true;

	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
		evt->ctr >= MSR_IA32_PMC0;
}
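
/*
 * Map a counter to its bit index in GLOBAL_CTRL/GLOBAL_STATUS: on Intel,
 * GP counter n is expected to map to bit n and fixed counter n to bit
 * FIXED_CNT_INDEX + n (architecturally bits 32 and up); event_to_global_idx()
 * derives that index from the counter's MSR address.
 */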
static int event_to_global_idx(pmu_counter_t *cnt)
{
	if (pmu.is_intel)
		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));

	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
	else
		return cnt->ctr - pmu.msr_gp_counter_base;
}

static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
{
	if (is_gp(cnt)) {
		int i;

		for (i = 0; i < gp_events_size; i++)
			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
				return &gp_events[i];
	} else {
		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;

		if (idx < ARRAY_SIZE(fixed_events))
			return &fixed_events[idx];
	}

	return (void*)0;
}

static void global_enable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	cnt->idx = event_to_global_idx(cnt);
	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
}

static void global_disable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
}

static void __start_event(pmu_counter_t *evt, uint64_t count)
{
	evt->count = count;
	wrmsr(evt->ctr, evt->count);
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config | EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		uint32_t usrospmi = 0;

		if (evt->config & EVNTSEL_OS)
			usrospmi |= (1 << 0);
		if (evt->config & EVNTSEL_USR)
			usrospmi |= (1 << 1);
		if (evt->config & EVNTSEL_INT)
			usrospmi |= (1 << 3); // PMI on overflow
		ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
	}
	apic_write(APIC_LVTPC, PMI_VECTOR);
}

static void start_event(pmu_counter_t *evt)
{
	__start_event(evt, 0);
	global_enable(evt);
}

static void __stop_event(pmu_counter_t *evt)
{
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config & ~EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
	}
	evt->count = rdmsr(evt->ctr);
}

static void stop_event(pmu_counter_t *evt)
{
	global_disable(evt);
	__stop_event(evt);
}

static noinline void measure_many(pmu_counter_t *evt, int count)
{
	int i;
	u64 cntrs = 0;

	for (i = 0; i < count; i++) {
		__start_event(&evt[i], 0);
		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
	}
	loop(cntrs);
	for (i = 0; i < count; i++)
		__stop_event(&evt[i]);
}

static void measure_one(pmu_counter_t *evt)
{
	measure_many(evt, 1);
}

static noinline void __measure(pmu_counter_t *evt, uint64_t count)
{
	u64 cntrs = BIT_ULL(event_to_global_idx(evt));

	__start_event(evt, count);
	loop(cntrs);
	__stop_event(evt);
}

static bool verify_event(uint64_t count, struct pmu_event *e)
{
	bool pass;

	if (!e)
		return false;

	pass = count >= e->min && count <= e->max;
	if (!pass)
		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);

	return pass;
}

static bool verify_counter(pmu_counter_t *cnt)
{
	return verify_event(cnt->count, get_counter_event(cnt));
}

static void check_gp_counter(struct pmu_event *evt)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
	};
	int i;

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);
		measure_one(&cnt);
		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
	}
}

static void check_gp_counters(void)
{
	int i;

	for (i = 0; i < gp_events_size; i++)
		if (pmu_gp_counter_is_available(i))
			check_gp_counter(&gp_events[i]);
		else
			printf("GP event '%s' is disabled\n",
			       gp_events[i].name);
}
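
/*
 * Note: fixed_events[].unit_sel does not hold an event select; it is reused
 * to store the fixed counter's MSR address (MSR_CORE_PERF_FIXED_CTR0 + n),
 * which the fixed-counter paths below assign directly to pmu_counter_t.ctr.
 */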
static void check_fixed_counters(void)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR,
	};
	int i;

	for (i = 0; i < fixed_counters_num; i++) {
		cnt.ctr = fixed_events[i].unit_sel;
		measure_one(&cnt);
		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
	}
}

static void check_counters_many(void)
{
	pmu_counter_t cnt[48];
	int i, n;

	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
		if (!pmu_gp_counter_is_available(i))
			continue;

		cnt[n].ctr = MSR_GP_COUNTERx(n);
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR |
			gp_events[i % gp_events_size].unit_sel;
		n++;
	}
	for (i = 0; i < fixed_counters_num; i++) {
		cnt[n].ctr = fixed_events[i].unit_sel;
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
		n++;
	}

	assert(n <= ARRAY_SIZE(cnt));
	measure_many(cnt, n);

	for (i = 0; i < n; i++)
		if (!verify_counter(&cnt[i]))
			break;

	report(i == n, "all counters");
}

static uint64_t measure_for_overflow(pmu_counter_t *cnt)
{
	__measure(cnt, 0);
	/*
	 * To generate overflow, i.e. roll over to '0', the initial count just
	 * needs to be preset to the negative expected count.  However, as per
	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
	 * the overflow interrupt is generated immediately instead of possibly
	 * waiting for the overflow to propagate through the counter.
	 */
	assert(cnt->count > 1);
	return 1 - cnt->count;
}

static void check_counter_overflow(void)
{
	int i;
	uint64_t overflow_preset;
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	overflow_preset = measure_for_overflow(&cnt);

	/* clear status before test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	report_prefix_push("overflow");

	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
		uint64_t status;
		int idx;

		cnt.count = overflow_preset;
		if (pmu_use_full_writes())
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;

		if (i == pmu.nr_gp_counters) {
			if (!pmu.is_intel)
				break;

			cnt.ctr = fixed_events[0].unit_sel;
			cnt.count = measure_for_overflow(&cnt);
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
		} else {
			cnt.ctr = MSR_GP_COUNTERx(i);
		}

		if (i % 2)
			cnt.config |= EVNTSEL_INT;
		else
			cnt.config &= ~EVNTSEL_INT;
		idx = event_to_global_idx(&cnt);
		__measure(&cnt, cnt.count);
		if (pmu.is_intel)
			report(cnt.count == 1, "cntr-%d", i);
		else
			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);

		if (!this_cpu_has_perf_global_status())
			continue;

		status = rdmsr(pmu.msr_global_status);
		report(status & (1ull << idx), "status-%d", i);
		wrmsr(pmu.msr_global_status_clr, status);
		status = rdmsr(pmu.msr_global_status);
		report(!(status & (1ull << idx)), "status clear-%d", i);
		report(check_irq() == (i % 2), "irq-%d", i);
	}

	report_prefix_pop();
}
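
/*
 * With a non-zero counter mask (CMASK) and INV clear, the event select is
 * expected to count cycles in which at least CMASK occurrences of the event
 * happen, i.e. the counter increments by at most one per cycle.  With
 * CMASK = 2 on the instructions event, the measured count should therefore
 * stay well below the plain instructions minimum for the loop.
 */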
static void check_gp_counter_cmask(void)
{
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
	measure_one(&cnt);
	report(cnt.count < gp_events[instruction_idx].min, "cmask");
}

static void do_rdpmc_fast(void *ptr)
{
	pmu_counter_t *cnt = ptr;
	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);

	if (!is_gp(cnt))
		idx |= 1 << 30;

	cnt->count = rdpmc(idx);
}

static void check_rdpmc(void)
{
	uint64_t val = 0xff0123456789ull;
	bool exc;
	int i;

	report_prefix_push("rdpmc");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		uint64_t x;
		pmu_counter_t cnt = {
			.ctr = MSR_GP_COUNTERx(i),
			.idx = i
		};

		/*
		 * Without full-width writes, only the low 32 bits are writable,
		 * and the value is sign-extended.
		 */
		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
			x = (uint64_t)(int64_t)(int32_t)val;
		else
			x = (uint64_t)(int64_t)val;

		/* Mask according to the number of supported bits */
		x &= (1ull << pmu.gp_counter_width) - 1;

		wrmsr(MSR_GP_COUNTERx(i), val);
		report(rdpmc(i) == x, "cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fast-%d", i);
		else
			report(cnt.count == (u32)val, "fast-%d", i);
	}
	for (i = 0; i < fixed_counters_num; i++) {
		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
		pmu_counter_t cnt = {
			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
			.idx = i
		};

		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fixed fast-%d", i);
		else
			report(cnt.count == (u32)x, "fixed fast-%d", i);
	}

	report_prefix_pop();
}

static void check_running_counter_wrmsr(void)
{
	uint64_t status;
	uint64_t count;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;

	pmu_counter_t evt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel,
	};

	report_prefix_push("running counter wrmsr");

	start_event(&evt);
	__loop();
	wrmsr(MSR_GP_COUNTERx(0), 0);
	stop_event(&evt);
	report(evt.count < gp_events[instruction_idx].min, "cntr");

	/* clear status before overflow test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	start_event(&evt);

	count = -1;
	if (pmu_use_full_writes())
		count &= (1ull << pmu.gp_counter_width) - 1;

	wrmsr(MSR_GP_COUNTERx(0), count);

	__loop();
	stop_event(&evt);

	if (this_cpu_has_perf_global_status()) {
		status = rdmsr(pmu.msr_global_status);
		report(status & 1, "status msr bit");
	}

	report_prefix_pop();
}
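
/*
 * The forced-emulation test below presets the two counters to the negative
 * expected counts (-KVM_FEP_BRANCHES and -KVM_FEP_INSNS), so the emulated
 * blob is expected to both produce an exact delta and push each counter
 * across zero, which is what the GLOBAL_STATUS overflow checks at the end
 * rely on.
 */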
static void check_emulated_instr(void)
{
	u32 eax, edx, ecx;
	uint64_t status, instr_start, brnch_start;
	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
	unsigned int branch_idx = pmu.is_intel ?
				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;
	pmu_counter_t brnch_cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		/* branch instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
	};
	pmu_counter_t instr_cnt = {
		.ctr = MSR_GP_COUNTERx(1),
		/* instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
	};
	report_prefix_push("emulated instruction");

	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	__start_event(&brnch_cnt, 0);
	__start_event(&instr_cnt, 0);

	brnch_start = -KVM_FEP_BRANCHES;
	instr_start = -KVM_FEP_INSNS;
	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);

	if (this_cpu_has_perf_global_ctrl()) {
		eax = BIT(0) | BIT(1);
		ecx = pmu.msr_global_ctl;
		edx = 0;
		kvm_fep_asm("wrmsr");
	} else {
		eax = ecx = edx = 0;
		kvm_fep_asm("nop");
	}

	__stop_event(&brnch_cnt);
	__stop_event(&instr_cnt);

	// Check that the end count - start count is at least the expected
	// number of instructions and branches.
	if (this_cpu_has_perf_global_ctrl()) {
		report(instr_cnt.count - instr_start == KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES,
		       "branch count");
	} else {
		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES,
		       "branch count");
	}

	if (this_cpu_has_perf_global_status()) {
		// Additionally check that those counters overflowed properly.
		status = rdmsr(pmu.msr_global_status);
		report(status & BIT_ULL(0), "branch counter overflow");
		report(status & BIT_ULL(1), "instruction counter overflow");
	}

	report_prefix_pop();
}

#define XBEGIN_STARTED (~0u)
static void check_tsx_cycles(void)
{
	pmu_counter_t cnt;
	unsigned int i, ret = 0;

	if (!this_cpu_has(X86_FEATURE_RTM))
		return;

	report_prefix_push("TSX cycles");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);

		if (i == 2) {
			/* Transactional cycles committed only on gp counter 2 */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
		} else {
			/* Transactional cycles */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
		}

		start_event(&cnt);

		asm volatile("xbegin 1f\n\t"
			     "1:\n\t"
			     : "+a" (ret) :: "memory");

		/* Generate a non-canonical #GP to trigger ABORT. */
		if (ret == XBEGIN_STARTED)
			*(int *)NONCANONICAL = 0;

		stop_event(&cnt);

		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
	}

	report_prefix_pop();
}

static void warm_up(void)
{
	int i;

	/*
	 * Since the cycles event is always run as the first event, the first
	 * measurement also pays cache warm-up costs, so the measured cycles
	 * value may exceed the predefined upper boundary and cause a false
	 * positive.  To avoid this, run a warm-up phase before the real
	 * verification.
	 */
	for (i = 0; i < 10; i++)
		loop(0);
}

static void check_counters(void)
{
	if (is_fep_available())
		check_emulated_instr();

	warm_up();
	check_gp_counters();
	check_fixed_counters();
	check_rdpmc();
	check_counters_many();
	check_counter_overflow();
	check_gp_counter_cmask();
	check_running_counter_wrmsr();
	check_tsx_cycles();
}

static void do_unsupported_width_counter_write(void *index)
{
	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
}
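
/*
 * With full-width writes available, each GP counter is reachable via two MSR
 * ranges: the legacy MSR_IA32_PERFCTRx aliases, which accept 64-bit writes
 * but only preserve the low 32 bits, and the MSR_IA32_PMCx aliases, which
 * accept writes up to the full GP counter width.  The test below expects to
 * read back the same value through either alias, regardless of which one was
 * written.
 */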
static void check_gp_counters_write_width(void)
{
	u64 val_64 = 0xffffff0123456789ull;
	u64 val_32 = val_64 & ((1ull << 32) - 1);
	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
	int i;

	/*
	 * MSR_IA32_PERFCTRn supports 64-bit writes,
	 * but only the lowest 32 bits are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
	}

	/*
	 * MSR_IA32_PMCn supports writing values up to GP counter width,
	 * and only the lowest bits of GP counter width are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PMC0 + i, val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);

		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);

		report(test_for_exception(GP_VECTOR,
			do_unsupported_width_counter_write, &i),
		"writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
	}
}

/*
 * Per the SDM, reference cycles are currently implemented using the
 * core crystal clock, TSC, or bus clock. Calibrate to the TSC
 * frequency to set reasonable expectations.
 */
static void set_ref_cycle_expectations(void)
{
	pmu_counter_t cnt = {
		.ctr = MSR_IA32_PERFCTR0,
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
	};
	uint64_t tsc_delta;
	uint64_t t0, t1, t2, t3;

	/* Bit 2 enumerates the availability of reference cycles events. */
	if (!pmu.nr_gp_counters || !pmu_gp_counter_is_available(2))
		return;

	if (this_cpu_has_perf_global_ctrl())
		wrmsr(pmu.msr_global_ctl, 0);

	t0 = fenced_rdtsc();
	start_event(&cnt);
	t1 = fenced_rdtsc();

	/*
	 * This loop has to run long enough to dominate the VM-exit
	 * costs for playing with the PMU MSRs on start and stop.
	 *
	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
	 * the core crystal clock, this function calculated a guest
	 * TSC : ref cycles ratio of around 105 with ECX initialized
	 * to one billion.
	 */
	asm volatile("loop ." : "+c"((int){1000000000ull}));

	t2 = fenced_rdtsc();
	stop_event(&cnt);
	t3 = fenced_rdtsc();

	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;

	if (!tsc_delta)
		return;

	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
}

static void check_invalid_rdpmc_gp(void)
{
	uint64_t val;

	report(rdpmc_safe(64, &val) == GP_VECTOR,
	       "Expected #GP on RDPMC(64)");
}

int main(int ac, char **av)
{
	int instruction_idx;
	int branch_idx;
	int branch_miss_idx;

	setup_vm();
	handle_irq(PMI_VECTOR, cnt_overflow);
	buf = malloc(N*64);

	check_invalid_rdpmc_gp();

	if (pmu.is_intel) {
		if (!pmu.version) {
			report_skip("No Intel Arch PMU is detected!");
			return report_summary();
		}
		gp_events = (struct pmu_event *)intel_gp_events;
		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
		instruction_idx = INTEL_INSTRUCTIONS_IDX;
		branch_idx = INTEL_BRANCHES_IDX;
		branch_miss_idx = INTEL_BRANCH_MISS_IDX;

		/*
		 * For legacy Intel CPUs without clflush/clflushopt support,
		 * there is no way to force an LLC miss, thus set the minimum
		 * value to 0 to avoid false positives.
		 */
		if (!this_cpu_has(X86_FEATURE_CLFLUSH))
			gp_events[INTEL_LLC_MISSES_IDX].min = 0;

		report_prefix_push("Intel");
		set_ref_cycle_expectations();
	} else {
		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
		gp_events = (struct pmu_event *)amd_gp_events;
		instruction_idx = AMD_INSTRUCTIONS_IDX;
		branch_idx = AMD_BRANCHES_IDX;
		branch_miss_idx = AMD_BRANCH_MISS_IDX;
		report_prefix_push("AMD");
	}
	adjust_events_range(gp_events, instruction_idx, branch_idx, branch_miss_idx);

	printf("PMU version: %d\n", pmu.version);
	printf("GP counters: %d\n", pmu.nr_gp_counters);
	printf("GP counter width: %d\n", pmu.gp_counter_width);
	printf("Mask length: %d\n", pmu.gp_counter_mask_length);
	printf("Fixed counters: %d\n", pmu.nr_fixed_counters);
	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);

	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
		report_info("Fixed counters number %d > defined fixed events %u. "
			    "Please update test case.", pmu.nr_fixed_counters,
			    (unsigned)ARRAY_SIZE(fixed_events));

	apic_write(APIC_LVTPC, PMI_VECTOR);

	check_counters();

	if (pmu_has_full_writes()) {
		pmu.msr_gp_counter_base = MSR_IA32_PMC0;

		report_prefix_push("full-width writes");
		check_counters();
		check_gp_counters_write_width();
		report_prefix_pop();
	}

	if (!pmu.is_intel) {
		report_prefix_push("K7");
		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
		check_counters();
		report_prefix_pop();
	}

	return report_summary();
}