/* kvm-unit-tests: x86/kvmclock.c */
#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"
#include "asm/barrier.h"

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define likely(x)	__builtin_expect(!!(x), 1)


struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting it by @shift and multiplying it by the
 * 32-bit fraction @mul_frac / 2^32, yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
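
/*
 * For illustration only: a portable sketch of the same computation using
 * GCC's unsigned __int128 (an assumption: the extension is 64-bit-only,
 * which is why the hand-written asm above exists). scale_delta_ref() is a
 * hypothetical helper, not used by the test.
 */
#ifdef __x86_64__
static inline u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	/* Full 64x32 -> 96-bit product; bits 95:32 are the scaled result. */
	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}
#endif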

#ifdef __i386__
# define do_div(n,base) ({					\
	u32 __base = (base);    				\
	u32 __rem;						\
	__rem = ((u64)(n)) % __base;                            \
	(n) = ((u64)(n)) / __base;				\
	__rem;							\
 })
#else
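/*
 * Shift-and-subtract long division: divide *n by base in place and return
 * the remainder. The doubling loop finds the largest base << k that still
 * fits below the dividend; the do/while then subtracts the powers back
 * out, accumulating quotient bits in res.
 */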
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base);
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

# define do_div(n,base) ({				\
	u32 __base = (base);    			\
	u32 __rem;					\
	(void)(((typeof((n)) *)0) == ((u64 *)0));	\
	if (likely(((n) >> 32) == 0)) {			\
		__rem = (u32)(n) % __base;		\
		(n) = (u32)(n) / __base;		\
	} else						\
		__rem = __div64_32(&(n), __base);	\
	__rem;						\
 })
#endif
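
/*
 * Example of the (Linux-style) do_div() convention used above: @n is both
 * input and output, divided in place, and only the remainder is returned.
 *
 *	u64 ns = 2500000000ULL;
 *	u32 rem = do_div(ns, NSEC_PER_SEC);	// now ns == 2, rem == 500000000
 */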

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set the seconds and nanoseconds fields of a timespec variable and
 * normalize to the timespec storage format.
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC
 * For negative values only the tv_sec field is negative!
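 *
 * Example (illustrative): sec=1, nsec=1500000000 normalizes to
 * tv_sec=2, tv_nsec=500000000; sec=2, nsec=-500000000 normalizes to
 * tv_sec=1, tv_nsec=500000000.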
124a7053e6dSJason Wang  */
set_normalized_timespec(struct timespec * ts,long sec,s64 nsec)12506846df5SThomas Huth static void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
126a7053e6dSJason Wang {
127a7053e6dSJason Wang 	while (nsec >= NSEC_PER_SEC) {
128a7053e6dSJason Wang 		/*
129a7053e6dSJason Wang 		 * The following asm() prevents the compiler from
130a7053e6dSJason Wang 		 * optimising this loop into a modulo operation. See
131a7053e6dSJason Wang 		 * also __iter_div_u64_rem() in include/linux/time.h
132a7053e6dSJason Wang 		 */
133a7053e6dSJason Wang 		asm("" : "+rm"(nsec));
134a7053e6dSJason Wang 		nsec -= NSEC_PER_SEC;
135a7053e6dSJason Wang 		++sec;
136a7053e6dSJason Wang 	}
137a7053e6dSJason Wang 	while (nsec < 0) {
138a7053e6dSJason Wang 		asm("" : "+rm"(nsec));
139a7053e6dSJason Wang 		nsec += NSEC_PER_SEC;
140a7053e6dSJason Wang 		--sec;
141a7053e6dSJason Wang 	}
142a7053e6dSJason Wang 	ts->tv_sec = sec;
143a7053e6dSJason Wang 	ts->tv_nsec = nsec;
144a7053e6dSJason Wang }
145a7053e6dSJason Wang 
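/*
 * The hypervisor publishes the time info with a seqcount-style protocol:
 * it makes ->version odd before an update and even again afterwards.
 * Clearing bit 0 here guarantees that a read racing with an update (odd
 * version) can never match the re-read in pvclock_read_retry(), so the
 * caller loops until it sees a consistent snapshot.
 */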
static inline
unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
{
	unsigned version = src->version & ~1;
	/* Make sure that the version is read before the data. */
	smp_rmb();
	return version;
}

static inline
bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
			unsigned version)
{
	/* Make sure that the version is re-read after the data. */
	smp_rmb();
	return version != src->version;
}

static inline u64 rdtsc_ordered(void)
{
	/*
	 * FIXME: on Intel CPUs rmb() aka lfence is sufficient, which would
	 * bring up to a 2x speedup.
	 */
	mb();
	return rdtsc();
}

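/*
 * Per the pvclock ABI, guest time is reconstructed from the host-provided
 * snapshot as:
 *
 *	time = system_time
 *	     + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul / 2^32
 *
 * (with a right shift when tsc_shift is negative).
 */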
static inline
cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
{
	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
	cycle_t offset = scale_delta(delta, src->tsc_to_system_mul,
					     src->tsc_shift);
	return src->system_time + offset;
}

static cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	unsigned version;
	cycle_t ret;
	u64 last;
	u8 flags;

	do {
		version = pvclock_read_begin(src);
		ret = __pvclock_read_cycles(src);
		flags = src->flags;
	} while (pvclock_read_retry(src, version));

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * Assumption here is that last_value, a global accumulator, always goes
	 * forward. If we are less than that, we should not be much smaller.
	 * We assume there is an error margin we're inside, and then the
	 * correction does not sacrifice accuracy.
	 *
	 * For reads: the global may have changed between test and return,
	 * but this only means someone else poked the clock at a later time.
	 * We just need to make sure we are not seeing a backwards event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus could be
	 * updating at the same time, and one of them could be slightly behind,
	 * making the assumption that last_value always goes forward fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

cycle_t kvm_clock_read(void)
{
        struct pvclock_vcpu_time_info *src;
        cycle_t ret;
        int index = smp_id();

        src = &hv_clock[index];
        ret = pvclock_clocksource_read(src);
        return ret;
}

void kvm_clock_init(void *data)
{
        int index = smp_id();
        struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

        printf("kvm-clock: cpu %d, msr %p\n", index, hvc);
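        /*
         * KVM paravirt ABI: bit 0 of MSR_KVM_SYSTEM_TIME_NEW is the enable
         * bit; the remaining bits hold the 4-byte-aligned guest physical
         * address of the per-vCPU time info the hypervisor keeps updated.
         */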
        wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1);
}

void kvm_clock_clear(void *data)
{
        wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL);
}

static void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
				   struct pvclock_vcpu_time_info *vcpu_time,
				   struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

void kvm_get_wallclock(struct timespec *ts)
{
        struct pvclock_vcpu_time_info *vcpu_time;
        int index = smp_id();

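        /*
         * KVM paravirt ABI: writing the structure's guest physical address
         * to MSR_KVM_WALL_CLOCK_NEW asks the hypervisor to fill it with the
         * wall-clock time at system boot.
         */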
        wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock);
        vcpu_time = &hv_clock[index];
        pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

void pvclock_set_flags(unsigned char flags)
{
        valid_flags = flags;
}