#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"
#include "asm/barrier.h"

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define likely(x)	__builtin_expect(!!(x), 1)


struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
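
/*
 * Worked example with illustrative numbers (a real host advertises values
 * derived from the actual TSC frequency): for a 2 GHz TSC the ratio is
 * 0.5 ns per cycle, i.e. tsc_shift = 0 and tsc_to_system_mul = 0x80000000
 * (0.5 in 32.32 fixed point), so
 *
 *	scale_delta(1000, 0x80000000, 0) == (1000 * 0x80000000) >> 32 == 500
 *
 * meaning 1000 elapsed cycles correspond to 500 ns of system time.
 */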

#ifdef __i386__
# define do_div(n,base) ({					\
	u32 __base = (base);					\
	u32 __rem;						\
	__rem = ((u64)(n)) % __base;				\
	(n) = ((u64)(n)) / __base;				\
	__rem;							\
})
#else
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base);
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the dividend a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

# define do_div(n,base) ({					\
	u32 __base = (base);					\
	u32 __rem;						\
	(void)(((typeof((n)) *)0) == ((u64 *)0));		\
	if (likely(((n) >> 32) == 0)) {				\
		__rem = (u32)(n) % __base;			\
		(n) = (u32)(n) / __base;			\
	} else							\
		__rem = __div64_32(&(n), __base);		\
	__rem;							\
})
#endif
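
/*
 * do_div() mirrors the kernel helper of the same name: it divides the
 * 64-bit lvalue in place and evaluates to the 32-bit remainder.  A
 * hypothetical call, for illustration only:
 *
 *	u64 ns = 1500000000ULL;
 *	u32 rem = do_div(ns, NSEC_PER_SEC);	// now ns == 1, rem == 500000000
 */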

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set seconds and nanoseconds field of a timespec variable and
 * normalize to the timespec storage format
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC
 * For negative values only the tv_sec field is negative!
 */
static void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
	while (nsec >= NSEC_PER_SEC) {
		/*
		 * The following asm() prevents the compiler from
		 * optimising this loop into a modulo operation.  See
		 * also __iter_div_u64_rem() in include/linux/time.h
		 */
		asm("" : "+rm"(nsec));
		nsec -= NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {
		asm("" : "+rm"(nsec));
		nsec += NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

static inline
unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
{
	unsigned version = src->version & ~1;
	/* Make sure that the version is read before the data. */
	smp_rmb();
	return version;
}

static inline
bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
			unsigned version)
{
	/* Make sure that the version is re-read after the data. */
	smp_rmb();
	return version != src->version;
}

static inline u64 rdtsc_ordered(void)
{
	/*
	 * FIXME: on Intel CPUs rmb() aka lfence is sufficient, which would
	 * bring up to a 2x speedup.
	 */
	mb();
	return rdtsc();
}

static inline
cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
{
	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
	cycle_t offset = scale_delta(delta, src->tsc_to_system_mul,
				     src->tsc_shift);
	return src->system_time + offset;
}
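
/*
 * Reader side of the pvclock version protocol: the host makes
 * src->version odd while it rewrites the record and increments it again
 * when done.  pvclock_read_begin() masks off the in-progress bit and
 * pvclock_read_retry() re-checks the version after the data has been
 * read, so the loop below keeps retrying until it observes a consistent
 * snapshot.
 */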
static cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	unsigned version;
	cycle_t ret;
	u64 last;
	u8 flags;

	do {
		version = pvclock_read_begin(src);
		ret = __pvclock_read_cycles(src);
		flags = src->flags;
	} while (pvclock_read_retry(src, version));

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * Assumption here is that last_value, a global accumulator, always
	 * goes forward. If we are less than that, we should not be much
	 * smaller. We assume there is an error margin we're inside, and the
	 * correction does not sacrifice accuracy.
	 *
	 * For reads: global may have changed between test and return,
	 * but this means someone else updated the clock at a later time.
	 * We just need to make sure we are not seeing a backwards event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus could be
	 * updating at the same time, and one of them could be slightly behind,
	 * making the assumption that last_value always goes forward fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

cycle_t kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;
	int index = smp_id();

	src = &hv_clock[index];
	ret = pvclock_clocksource_read(src);
	return ret;
}

void kvm_clock_init(void *data)
{
	int index = smp_id();
	struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

	printf("kvm-clock: cpu %d, msr %p\n", index, hvc);
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1);
}

void kvm_clock_clear(void *data)
{
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL);
}
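
/*
 * The wall clock is exported as a boot-time baseline: writing the guest
 * address of a struct pvclock_wall_clock to MSR_KVM_WALL_CLOCK_NEW asks
 * the host to fill in the wall-clock time at system boot, and the current
 * wall-clock time is then that baseline plus the elapsed system time read
 * from the per-cpu pvclock record.
 */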
static void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
				   struct pvclock_vcpu_time_info *vcpu_time,
				   struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

void kvm_get_wallclock(struct timespec *ts)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int index = smp_id();

	wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock);
	vcpu_time = &hv_clock[index];
	pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

void pvclock_set_flags(unsigned char flags)
{
	valid_flags = flags;
}
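
/*
 * Typical usage, sketched for illustration only (it assumes the usual
 * kvm-unit-tests smp helpers; see x86/kvmclock_test.c for a real caller):
 *
 *	on_cpus(kvm_clock_init, NULL);		   // enable kvmclock on all cpus
 *	pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); // trust the stable-TSC path
 *	cycle_t now = kvm_clock_read();		   // ns since guest boot
 *	on_cpus(kvm_clock_clear, NULL);		   // disable before exiting
 */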