/* xref: /kvm-unit-tests/x86/kvmclock.c (revision 2b0de978b2392a2236bad9d0a560e184c45e1313) */
#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"
#include "asm/barrier.h"

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define likely(x)	__builtin_expect(!!(x), 1)

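/*
 * Per-CPU kvmclock time info pages and the shared wall clock structure.
 * The hypervisor updates these once their addresses are handed over via
 * the MSR writes in kvm_clock_init()/kvm_get_wallclock() below.
 */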
struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting it and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

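	/*
	 * Both implementations below compute the middle 64 bits of the
	 * 96-bit product delta * mul_frac, i.e. (delta * mul_frac) >> 32.
	 */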
#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

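/*
 * do_div(n, base) divides the 64-bit value n by the 32-bit base in place
 * (n becomes the quotient) and evaluates to the 32-bit remainder.
 */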
#ifdef __i386__
# define do_div(n,base) ({					\
	u32 __base = (base);					\
	u32 __rem;						\
	__rem = ((u64)(n)) % __base;				\
	(n) = ((u64)(n)) / __base;				\
	__rem;							\
 })
#else
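/*
 * Fallback 64-by-32 division, done as shift-and-subtract long division:
 * *n is replaced by the quotient and the remainder is returned.
 */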
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

# define do_div(n,base) ({				\
	u32 __base = (base);				\
	u32 __rem;					\
	(void)(((typeof((n)) *)0) == ((u64 *)0));	\
	if (likely(((n) >> 32) == 0)) {			\
		__rem = (u32)(n) % __base;		\
		(n) = (u32)(n) / __base;		\
	} else						\
		__rem = __div64_32(&(n), __base);	\
	__rem;						\
 })
#endif

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set the seconds and nanoseconds fields of a timespec variable and
 * normalize to the timespec storage format.
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC
 * For negative values only the tv_sec field is negative!
 */
void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
	while (nsec >= NSEC_PER_SEC) {
		/*
		 * The following asm() prevents the compiler from
		 * optimising this loop into a modulo operation. See
		 * also __iter_div_u64_rem() in include/linux/time.h
		 */
		asm("" : "+rm"(nsec));
		nsec -= NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {
		asm("" : "+rm"(nsec));
		nsec += NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

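/*
 * The version field of a pvclock page works like a seqcount: the
 * hypervisor makes it odd while an update is in progress and bumps it
 * to an even value when done.  Read it (with the low bit masked off)
 * before the data, and re-check it afterwards.
 */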
static inline
unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
{
	unsigned version = src->version & ~1;
	/* Make sure that the version is read before the data. */
	smp_rmb();
	return version;
}

static inline
bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
			unsigned version)
{
	/* Make sure that the version is re-read after the data. */
	smp_rmb();
	return version != src->version;
}

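/* Read the TSC, fenced so it is not reordered with the surrounding loads of the time info. */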
static inline u64 rdtsc_ordered(void)
{
	/*
	 * FIXME: on Intel CPUs rmb() (i.e. lfence) is sufficient, which would
	 * bring up to a 2x speedup.
	 */
	mb();
	return rdtsc();
}

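/*
 * Convert the guest TSC to nanoseconds using the per-vCPU pvclock
 * parameters: the TSC delta is shifted by tsc_shift, multiplied by
 * tsc_to_system_mul and divided by 2^32, then added to system_time.
 */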
static inline
cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
{
	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
	cycle_t offset = scale_delta(delta, src->tsc_to_system_mul,
					     src->tsc_shift);
	return src->system_time + offset;
}

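/*
 * Read the clock from a per-vCPU time info page, retrying while the
 * hypervisor is updating it.  Unless raw cycles were requested or the
 * TSC is flagged stable, the result is clamped so it never goes
 * backwards relative to what any vCPU has already returned.
 */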
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	unsigned version;
	cycle_t ret;
	u64 last;
	u8 flags;

	do {
		version = pvclock_read_begin(src);
		ret = __pvclock_read_cycles(src);
		flags = src->flags;
	} while (pvclock_read_retry(src, version));

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * Assumption here is that last_value, a global accumulator, always goes
	 * forward. If we are less than that, we should not be much smaller.
	 * We assume there is an error margin we're inside, and then the
	 * correction does not sacrifice accuracy.
	 *
	 * For reads: the global may have changed between test and return,
	 * but this means someone else poked the clock at a later time.
	 * We just need to make sure we are not seeing a backwards event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus could be
	 * updating at the same time, and one of them could be slightly behind,
	 * making the assumption that last_value always goes forward fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

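/* Read kvmclock from the calling CPU's time info page. */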
cycle_t kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;
	int index = smp_id();

	src = &hv_clock[index];
	ret = pvclock_clocksource_read(src);
	return ret;
}

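/*
 * Enable kvmclock for this CPU: MSR_KVM_SYSTEM_TIME_NEW takes the address
 * of the per-CPU time info structure, with bit 0 acting as the enable bit.
 */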
void kvm_clock_init(void *data)
{
	int index = smp_id();
	struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

	printf("kvm-clock: cpu %d, msr %p\n", index, hvc);
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1);
}

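/* Disable kvmclock for this CPU by clearing the system time MSR. */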
void kvm_clock_clear(void *data)
{
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL);
}

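/*
 * Current wall clock time = wall clock at boot (read from the shared
 * wall_clock structure under its own version counter) + time since boot
 * taken from kvmclock.
 */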
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

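/*
 * Ask the hypervisor to fill in wall_clock by writing its address to
 * MSR_KVM_WALL_CLOCK_NEW, then convert the result to a timespec.
 */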
void kvm_get_wallclock(struct timespec *ts)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int index = smp_id();

	wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock);
	vcpu_time = &hv_clock[index];
	pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

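/* Select which pvclock flags (e.g. PVCLOCK_TSC_STABLE_BIT) readers will honor. */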
void pvclock_set_flags(unsigned char flags)
{
	valid_flags = flags;
}