#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define likely(x)	__builtin_expect(!!(x), 1)

struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting and then multiplying by a 32-bit
 * fraction, yielding a 64-bit result: ((delta << shift) * mul_frac) >> 32.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	/*
	 * Multiply each 32-bit half of delta by mul_frac separately and
	 * add the cross term, keeping the middle 64 bits of the 96-bit
	 * product.
	 */
	__asm__ (
		"mul %5       ; "
		"mov %4,%%eax ; "
		"mov %%edx,%4 ; "
		"mul %5       ; "
		"xor %5,%5    ; "
		"add %4,%%eax ; "
		"adc %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	/* Full 64x64 multiply, then keep the middle 64 bits. */
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
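
/*
 * A portable reference sketch of what the assembly above computes, kept
 * purely for illustration (the asm variants are what this file actually
 * uses). It assumes the GCC/clang __int128 extension, hence the
 * 64-bit-only guard; the _ref name is ours, not part of any API.
 */
#ifdef __x86_64__
static inline u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* 64x32 -> 96-bit product; keep the middle 64 bits. */
	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}
#endif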

/*
 * do_div(n, base): 64-by-32 division that divides n in place and
 * returns the 32-bit remainder, mirroring the Linux kernel helper
 * of the same name.
 */
#ifdef __i386__
# define do_div(n,base) ({					\
	u32 __base = (base);					\
	u32 __rem;						\
	__rem = ((u64)(n)) % __base;				\
	(n) = ((u64)(n)) / __base;				\
	__rem;							\
})
#else
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	/* Shift the divisor up to the highest set bit of the remainder... */
	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	/* ...then subtract it back out, one bit position at a time. */
	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

# define do_div(n,base) ({					\
	u32 __base = (base);					\
	u32 __rem;						\
	/* compile-time check that n has type u64 */		\
	(void)(((typeof((n)) *)0) == ((u64 *)0));		\
	if (likely(((n) >> 32) == 0)) {				\
		__rem = (u32)(n) % __base;			\
		(n) = (u32)(n) / __base;			\
	} else							\
		__rem = __div64_32(&(n), __base);		\
	__rem;							\
})
#endif
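
/*
 * Minimal usage sketch for do_div() above, with an illustrative name:
 * split a nanosecond count into whole seconds plus remainder, the same
 * way pvclock_read_wallclock() below does.
 */
static inline void ns_to_timespec_example(u64 ns, struct timespec *ts)
{
	u32 rem = do_div(ns, NSEC_PER_SEC);	/* ns now holds whole seconds */

	ts->tv_sec = ns;
	ts->tv_nsec = rem;
}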

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set the seconds and nanoseconds fields of a timespec variable and
 * normalize to the timespec storage format.
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative!
 */
void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
	while (nsec >= NSEC_PER_SEC) {
		/*
		 * The following asm() prevents the compiler from
		 * optimising this loop into a modulo operation. See
		 * also __iter_div_u64_rem() in include/linux/time.h
		 */
		asm("" : "+rm"(nsec));
		nsec -= NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {
		asm("" : "+rm"(nsec));
		nsec += NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

/* Nanoseconds elapsed on this vCPU since its pvclock tsc_timestamp. */
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
	u64 delta = rdtsc() - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

/*
 * Read a consistent set of time-base values from the hypervisor
 * into a shadow data area.
 */
static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
					struct pvclock_vcpu_time_info *src)
{
	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
		dst->tsc_shift = src->tsc_shift;
		dst->flags = src->flags;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) || (dst->version != src->version));

	return dst->version;
}
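
/*
 * The loop above is a seqlock-style reader: per the pvclock ABI, the
 * hypervisor sets an odd version while it is updating the time page and
 * bumps it to an even value when done, so a torn read is caught either by
 * an odd version or by a version change across the copy. A minimal sketch
 * of just the retry protocol, with an illustrative name:
 */
static inline u32 pvclock_wait_stable_version_example(
				struct pvclock_vcpu_time_info *src)
{
	u32 version;

	do {
		version = src->version;
		rmb();		/* read version before any payload fields */
	} while (version & 1);	/* odd: update in progress, try again */

	return version;
}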

cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	struct pvclock_shadow_time shadow;
	unsigned version;
	cycle_t ret, offset;
	u64 last;

	do {
		version = pvclock_get_time_values(&shadow, src);
		barrier();
		offset = pvclock_get_nsec_offset(&shadow);
		ret = shadow.system_timestamp + offset;
		barrier();
	} while (version != src->version);

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (shadow.flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * Assumption here is that last_value, a global accumulator, always
	 * goes forward. If our value is less than it, we should not be much
	 * smaller: we assume there is an error margin we stay inside, so
	 * the correction below does not sacrifice accuracy.
	 *
	 * For reads: the global may have changed between the test and the
	 * return, but that just means someone else poked the clock at a
	 * later time. We only need to make sure we never see a backwards
	 * event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus
	 * could be updating at the same time, and one of them could be
	 * slightly behind, making the assumption that last_value always
	 * goes forward fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

cycle_t kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;
	int index = smp_id();

	src = &hv_clock[index];
	ret = pvclock_clocksource_read(src);
	return ret;
}

void kvm_clock_init(void *data)
{
	int index = smp_id();
	struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

	printf("kvm-clock: cpu %d, msr 0x%lx\n", index, (unsigned long)hvc);
	/* Bit 0 of the MSR value enables the per-vCPU time page. */
	wrmsr(MSR_KVM_SYSTEM_TIME, (unsigned long)hvc | 1);
}

void kvm_clock_clear(void *data)
{
	wrmsr(MSR_KVM_SYSTEM_TIME, 0LL);
}

void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
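
/*
 * A standalone sketch of the monotonicity clamp used in
 * pvclock_clocksource_read() above, with illustrative names only.
 * atomic64_cmpxchg() returns the value found in *last before the call,
 * so the loop exits once `ret` is observed as the published value, and
 * otherwise hands back whichever later value won the race.
 */
static inline u64 clamp_monotonic_example(atomic64_t *last, u64 ret)
{
	u64 prev = atomic64_read(last);

	do {
		if (ret < prev)
			return prev;	/* someone published a later time */
		prev = atomic64_cmpxchg(last, prev, ret);
	} while (prev != ret);

	return ret;
}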

void kvm_get_wallclock(struct timespec *ts)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int index = smp_id();

	/* Writing the MSR asks the hypervisor to refill wall_clock. */
	wrmsr(MSR_KVM_WALL_CLOCK, (unsigned long)&wall_clock);
	vcpu_time = &hv_clock[index];
	pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

void pvclock_set_flags(unsigned char flags)
{
	valid_flags = flags;
}
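
/*
 * Hedged usage sketch, not part of this file's API: how a test body might
 * bring kvm-clock up on the calling CPU and exercise both clocks. The
 * function name is illustrative only.
 */
static inline void kvm_clock_usage_example(void)
{
	struct timespec ts;

	kvm_clock_init(NULL);		/* register this CPU's time page */
	printf("monotonic: %lu ns\n", (unsigned long)kvm_clock_read());
	kvm_get_wallclock(&ts);
	printf("wallclock: %ld s, %ld ns\n", ts.tv_sec, ts.tv_nsec);
	kvm_clock_clear(NULL);		/* unregister before returning */
}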