/* kvm-unit-tests: x86/kvmclock.c (revision a7053e6d) */

#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define likely(x)	__builtin_expect(!!(x), 1)

struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

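/*
 * Illustrative only, not part of the original test: the same 32.32
 * fixed-point computation as scale_delta(), written in portable C with
 * GCC's 128-bit integers on x86_64. The name scale_delta_ref is ours.
 * Example: mul_frac = 1u << 31 encodes the fraction 0.5, so
 * scale_delta(1000, 1u << 31, 0) yields 500.
 */
#ifdef __x86_64__
static inline u64 __attribute__((unused))
scale_delta_ref(u64 delta, u32 mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	/* multiply into a 128-bit intermediate, then drop the fraction bits */
	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}
#endif
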
#ifdef __i386__
# define do_div(n,base) ({					\
	u32 __base = (base);					\
	u32 __rem;						\
	__rem = ((u64)(n)) % __base;				\
	(n) = ((u64)(n)) / __base;				\
	__rem;							\
 })
#else
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

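/*
 * Illustrative trace, not from the original source: __div64_32(&n, 7)
 * with n = 100 first doubles b,d up to b = 112, d = 16, then subtracts
 * falling multiples of the divisor: 100 - 56 = 44 (res += 8),
 * 44 - 28 = 16 (res += 4), 16 - 14 = 2 (res += 2), leaving *n = 14 and
 * returning remainder 2.
 */
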
# define do_div(n,base) ({				\
	u32 __base = (base);				\
	u32 __rem;					\
	(void)(((typeof((n)) *)0) == ((u64 *)0));	/* n must be a u64 */ \
	if (likely(((n) >> 32) == 0)) {			\
		__rem = (u32)(n) % __base;		\
		(n) = (u32)(n) / __base;		\
	} else						\
		__rem = __div64_32(&(n), __base);	\
	__rem;						\
 })
#endif

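/*
 * Usage sketch (illustrative): do_div() divides in place and returns
 * the remainder, mirroring the Linux kernel helper of the same name:
 *
 *	u64 ns = 2500000000ull;
 *	u32 rem = do_div(ns, NSEC_PER_SEC);	// ns == 2, rem == 500000000
 */
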
/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set the seconds and nanoseconds fields of a timespec variable and
 * normalize to the timespec storage format.
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative!
 */
void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
	while (nsec >= NSEC_PER_SEC) {
		/*
		 * The following asm() prevents the compiler from
		 * optimising this loop into a modulo operation. See
		 * also __iter_div_u64_rem() in include/linux/time.h
		 */
		asm("" : "+rm"(nsec));
		nsec -= NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {
		asm("" : "+rm"(nsec));
		nsec += NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

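/*
 * Example (illustrative): set_normalized_timespec(&ts, 1, 1500000000)
 * stores tv_sec = 2, tv_nsec = 500000000, while
 * set_normalized_timespec(&ts, 1, -1) stores tv_sec = 0,
 * tv_nsec = 999999999.
 */
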
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
	u64 delta = rdtsc() - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

/*
 * Reads a consistent set of time-base values from the hypervisor
 * into a shadow data area.
 */
static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
					struct pvclock_vcpu_time_info *src)
{
	/* src->version is odd while the hypervisor is mid-update */
	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp     = src->tsc_timestamp;
		dst->system_timestamp  = src->system_time;
		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
		dst->tsc_shift         = src->tsc_shift;
		dst->flags             = src->flags;
		rmb();		/* check version after fetching data */
	} while ((src->version & 1) || (dst->version != src->version));

	return dst->version;
}

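/*
 * Illustrative sketch of the expected writer side of this seqlock-style
 * protocol (hypervisor behavior, not code from this test): the version
 * is bumped to an odd value before the fields are updated and to an
 * even value afterwards, so a torn read is detectable:
 *
 *	src->version++;		// odd: update in progress
 *	wmb();
 *	... write tsc_timestamp, system_time, mul/shift, flags ...
 *	wmb();
 *	src->version++;		// even: consistent again
 */
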
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	struct pvclock_shadow_time shadow;
	unsigned version;
	cycle_t ret, offset;
	u64 last;

	do {
		version = pvclock_get_time_values(&shadow, src);
		barrier();
		offset = pvclock_get_nsec_offset(&shadow);
		ret = shadow.system_timestamp + offset;
		barrier();
	} while (version != src->version);

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (shadow.flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * The assumption here is that last_value, a global accumulator, always
	 * goes forward. If we are less than that, we should not be much
	 * smaller. We assume we are within an error margin, and then the
	 * correction does not sacrifice accuracy.
	 *
	 * For reads: the global may have changed between the test and the
	 * return, but that just means someone else updated the clock at a
	 * later time. We only need to make sure we never see a backwards
	 * event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus could
	 * be updating at the same time, and one of them could be slightly
	 * behind, making the assumption that last_value always goes forward
	 * fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

cycle_t kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;
	int index = smp_id();

	src = &hv_clock[index];
	ret = pvclock_clocksource_read(src);
	return ret;
}

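/*
 * Usage sketch (illustrative): the cmpxchg loop in
 * pvclock_clocksource_read() keeps reads monotonic even across vCPUs,
 * unless PVCLOCK_RAW_CYCLE_BIT asks for raw values:
 *
 *	cycle_t t1 = kvm_clock_read();
 *	cycle_t t2 = kvm_clock_read();
 *	// t2 >= t1 is expected to hold
 */
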
void kvm_clock_init(void *data)
{
	int index = smp_id();
	struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

	printf("kvm-clock: cpu %d, msr 0x%lx\n", index, (unsigned long)hvc);
	/* bit 0 of the MSR value enables the per-vCPU clock */
	wrmsr(MSR_KVM_SYSTEM_TIME, (unsigned long)hvc | 1);
}

void kvm_clock_clear(void *data)
{
	/* writing 0 clears the enable bit and shuts the clock down */
	wrmsr(MSR_KVM_SYSTEM_TIME, 0LL);
}

void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

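/*
 * Worked example (illustrative numbers): with a boot wallclock of
 * sec = 1000000000, nsec = 250000000 and pvclock_clocksource_read()
 * returning 5250000000 ns since boot, delta becomes
 * 1000000005500000000, which do_div() splits into tv_sec = 1000000005
 * and tv_nsec = 500000000.
 */
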
void kvm_get_wallclock(struct timespec *ts)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int index = smp_id();

	/* writing the address asks the hypervisor to fill wall_clock */
	wrmsr(MSR_KVM_WALL_CLOCK, (unsigned long)&wall_clock);
	vcpu_time = &hv_clock[index];
	pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

void pvclock_set_flags(unsigned char flags)
{
	valid_flags = flags;
}
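
/*
 * Usage sketch (illustrative): a test that trusts the hypervisor's
 * stable-TSC indication can skip the global monotonicity fixup with
 *
 *	pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 *
 * after which pvclock_clocksource_read() returns its raw per-vCPU value
 * whenever the hypervisor also sets PVCLOCK_TSC_STABLE_BIT.
 */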