#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"
#include "asm/barrier.h"

#define unlikely(x) __builtin_expect(!!(x), 0)
#define likely(x) __builtin_expect(!!(x), 1)

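/*
 * Per-vCPU pvclock pages registered with the host via MSR_KVM_SYSTEM_TIME_NEW,
 * plus the shared wall clock structure filled in via MSR_KVM_WALL_CLOCK_NEW.
 */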
struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul %5 ; "
		"mov %4,%%eax ; "
		"mov %%edx,%4 ; "
		"mul %5 ; "
		"xor %5,%5 ; "
		"add %4,%%eax ; "
		"adc %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

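/*
 * do_div(n, base): 64-bit by 32-bit division.  Divides n in place and
 * evaluates to the 32-bit remainder, mirroring the Linux kernel macro of
 * the same name.
 */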
#ifdef __i386__
# define do_div(n,base) ({				\
	u32 __base = (base);				\
	u32 __rem;					\
	__rem = ((u64)(n)) % __base;			\
	(n) = ((u64)(n)) / __base;			\
	__rem;						\
 })
#else
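/*
 * Fallback 64-bit by 32-bit division using shift-and-subtract long division.
 * Declared __attribute__((weak)) so another definition can override it if
 * one is linked in.
 */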
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base);
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

# define do_div(n,base) ({				\
	u32 __base = (base);				\
	u32 __rem;					\
	(void)(((typeof((n)) *)0) == ((u64 *)0));	\
	if (likely(((n) >> 32) == 0)) {			\
		__rem = (u32)(n) % __base;		\
		(n) = (u32)(n) / __base;		\
	} else						\
		__rem = __div64_32(&(n), __base);	\
	__rem;						\
 })
#endif

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set seconds and nanoseconds field of a timespec variable and
 * normalize to the timespec storage format
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC
 * For negative values only the tv_sec field is negative!
 */
static void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
	while (nsec >= NSEC_PER_SEC) {
		/*
		 * The following asm() prevents the compiler from
		 * optimising this loop into a modulo operation. See
		 * also __iter_div_u64_rem() in include/linux/time.h
		 */
		asm("" : "+rm"(nsec));
		nsec -= NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {
		asm("" : "+rm"(nsec));
		nsec += NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

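/*
 * The pvclock page is protected by a seqlock-style version field: the host
 * makes it odd while updating the structure and bumps it back to an even
 * value when done.  Readers snapshot the (even) version, read the data, and
 * retry if the version changed in between.
 */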
static inline
unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
{
	unsigned version = src->version & ~1;
	/* Make sure that the version is read before the data. */
	smp_rmb();
	return version;
}

static inline
bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
			unsigned version)
{
	/* Make sure that the version is re-read after the data. */
	smp_rmb();
	return version != src->version;
}

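/*
 * Read the TSC behind a barrier so the read is ordered with respect to the
 * surrounding accesses to the pvclock page.
 */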
static inline u64 rdtsc_ordered(void)
{
	/*
	 * FIXME: on Intel CPUs rmb() aka lfence is sufficient, which would
	 * bring up to a 2x speedup.
	 */
	mb();
	return rdtsc();
}

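/*
 * Convert a raw TSC delta to nanoseconds using the host-provided scale:
 *
 *   time = system_time + ((tsc - tsc_timestamp) << tsc_shift) *
 *          tsc_to_system_mul / 2^32
 *
 * (with a right shift instead when tsc_shift is negative).
 */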
static inline
cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
{
	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
	cycle_t offset = scale_delta(delta, src->tsc_to_system_mul,
				     src->tsc_shift);
	return src->system_time + offset;
}

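/*
 * Read the clock for this vCPU under the version seqlock.  Unless raw cycles
 * were requested or the host flags the TSC as stable, clamp the result
 * against the global last_value so readers on different vCPUs never observe
 * time going backwards.
 */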
static cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	unsigned version;
	cycle_t ret;
	u64 last;
	u8 flags;

	do {
		version = pvclock_read_begin(src);
		ret = __pvclock_read_cycles(src);
		flags = src->flags;
	} while (pvclock_read_retry(src, version));

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * Assumption here is that last_value, a global accumulator, always
	 * goes forward.  If we are less than that, we should not be much
	 * smaller.  We assume there is an error margin we're inside, and then
	 * the correction does not sacrifice accuracy.
	 *
	 * For reads: the global may have changed between the test and the
	 * return, but that only means someone else updated the clock at a
	 * later time.  We just need to make sure we are not seeing a
	 * backwards event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus could
	 * be updating at the same time, and one of them could be slightly
	 * behind, making the assumption that last_value always goes forward
	 * fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

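/* Return the current kvmclock reading for the calling CPU. */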
cycle_t kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;
	int index = smp_id();

	src = &hv_clock[index];
	ret = pvclock_clocksource_read(src);
	return ret;
}

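/*
 * Register this CPU's pvclock page with the host.  Bit 0 of the MSR value
 * enables the clock; the remaining bits carry the page's guest physical
 * address.
 */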
void kvm_clock_init(void *data)
{
	int index = smp_id();
	struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

	printf("kvm-clock: cpu %d, msr %p\n", index, hvc);
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1);
}

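/* Disable kvmclock updates for this CPU by clearing the system time MSR. */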
void kvm_clock_clear(void *data)
{
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL);
}

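/*
 * Compute the current wall clock time: the boot-time wall clock published by
 * the host (read under its own version field) plus the time since boot taken
 * from the per-vCPU clock, normalized into a timespec.
 */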
static void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
				   struct pvclock_vcpu_time_info *vcpu_time,
				   struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

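/*
 * Ask the host to (re)fill the shared wall clock structure, then combine it
 * with this CPU's system time to produce a timespec.
 */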
void kvm_get_wallclock(struct timespec *ts)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int index = smp_id();

	wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock);
	vcpu_time = &hv_clock[index];
	pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

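/*
 * Set which pvclock flags (e.g. PVCLOCK_TSC_STABLE_BIT) the reader is
 * allowed to honor when reading the clock.
 */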
void pvclock_set_flags(unsigned char flags)
{
	valid_flags = flags;
}