#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define likely(x)	__builtin_expect(!!(x), 1)

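/*
 * Per-vcpu time info pages shared with the hypervisor, the shared wall
 * clock area, and the last clock value handed out (used to keep reads
 * monotonic across vcpus when the TSC is not flagged stable).
 */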
struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting it and multiplying by a 32-bit
 * fraction, yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

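/*
 * do_div(n, base): divide the u64 lvalue n in place by the 32-bit base
 * and evaluate to the 32-bit remainder.  The 64-bit variant below also
 * checks at compile time that n really is a u64.
 */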
#ifdef __i386__
# define do_div(n,base) ({					\
	u32 __base = (base);					\
	u32 __rem;						\
	__rem = ((u64)(n)) % __base;				\
	(n) = ((u64)(n)) / __base;				\
	__rem;							\
 })
#else
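/*
 * Shift-and-subtract long division, used by do_div() when the dividend
 * does not fit in 32 bits.
 */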
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

# define do_div(n,base) ({				\
	u32 __base = (base);				\
	u32 __rem;					\
	(void)(((typeof((n)) *)0) == ((u64 *)0));	\
	if (likely(((n) >> 32) == 0)) {			\
		__rem = (u32)(n) % __base;		\
		(n) = (u32)(n) / __base;		\
	} else						\
		__rem = __div64_32(&(n), __base);	\
	__rem;						\
 })
#endif

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set the seconds and nanoseconds fields of a timespec variable and
 * normalize to the timespec storage format.
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative!
 */
void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
	while (nsec >= NSEC_PER_SEC) {
		/*
		 * The following asm() prevents the compiler from
		 * optimising this loop into a modulo operation. See
		 * also __iter_div_u64_rem() in include/linux/time.h
		 */
		asm("" : "+rm"(nsec));
		nsec -= NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {
		asm("" : "+rm"(nsec));
		nsec += NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

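/* Nanoseconds elapsed since the hypervisor's last TSC snapshot. */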
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
	u64 delta = rdtsc() - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

/*
 * Reads a consistent set of time-base values from the hypervisor
 * into a shadow data area.
 */
static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
					struct pvclock_vcpu_time_info *src)
{
	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp     = src->tsc_timestamp;
		dst->system_timestamp  = src->system_time;
		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
		dst->tsc_shift         = src->tsc_shift;
		dst->flags             = src->flags;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) || (dst->version != src->version));

	return dst->version;
}

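/*
 * Return the current system time in nanoseconds: the hypervisor's
 * system_time snapshot plus the TSC delta since that snapshot, retried
 * until a consistent shadow copy is read.  Unless raw cycles are
 * requested or the TSC is flagged stable, the result is clamped so it
 * never moves behind the global last_value.
 */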
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	struct pvclock_shadow_time shadow;
	unsigned version;
	cycle_t ret, offset;
	u64 last;

	do {
		version = pvclock_get_time_values(&shadow, src);
		mb();
		offset = pvclock_get_nsec_offset(&shadow);
		ret = shadow.system_timestamp + offset;
		mb();
	} while (version != src->version);

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (shadow.flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * Assumption here is that last_value, a global accumulator, always goes
	 * forward.  If we are less than that, we should not be much smaller.
	 * We assume there is an error margin we're inside, and then the
	 * correction does not sacrifice accuracy.
	 *
	 * For reads: the global may have changed between test and return,
	 * but this means someone else poked the clock at a later time.
	 * We just need to make sure we are not seeing a backwards event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus could be
	 * updating at the same time, and one of them could be slightly behind,
	 * making the assumption that last_value always goes forward fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

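/* Read the kvmclock value on the CPU we are currently running on. */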
cycle_t kvm_clock_read(void)
{
        struct pvclock_vcpu_time_info *src;
        cycle_t ret;
        int index = smp_id();

        src = &hv_clock[index];
        ret = pvclock_clocksource_read(src);
        return ret;
}

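/*
 * Register this CPU's time info page with the hypervisor via
 * MSR_KVM_SYSTEM_TIME; the low bit of the written value enables updates.
 */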
void kvm_clock_init(void *data)
{
        int index = smp_id();
        struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

        printf("kvm-clock: cpu %d, msr 0x%lx\n", index, (unsigned long)hvc);
        wrmsr(MSR_KVM_SYSTEM_TIME, (unsigned long)hvc | 1);
}

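/* Stop hypervisor updates of this CPU's time info page. */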
void kvm_clock_clear(void *data)
{
        wrmsr(MSR_KVM_SYSTEM_TIME, 0LL);
}

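/*
 * Compute the current wall-clock time: the wall clock at system boot
 * (read under its own version loop) plus the time since boot taken
 * from the per-cpu pvclock area.
 */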
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

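/*
 * Ask the hypervisor to fill the shared wall_clock structure, then
 * convert it to a timespec using this CPU's time info page.
 */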
void kvm_get_wallclock(struct timespec *ts)
{
        struct pvclock_vcpu_time_info *vcpu_time;
        int index = smp_id();

        wrmsr(MSR_KVM_WALL_CLOCK, (unsigned long)&wall_clock);
        vcpu_time = &hv_clock[index];
        pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

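/*
 * Select which pvclock flags the readers honour, e.g.
 * PVCLOCK_TSC_STABLE_BIT or PVCLOCK_RAW_CYCLE_BIT.
 */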
void pvclock_set_flags(unsigned char flags)
{
        valid_flags = flags;
}
280