#include "libcflat.h"
#include "smp.h"
#include "atomic.h"
#include "processor.h"
#include "kvmclock.h"
#include "asm/barrier.h"

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define likely(x)	__builtin_expect(!!(x), 1)


struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
struct pvclock_wall_clock wall_clock;
static unsigned char valid_flags = 0;
static atomic64_t last_value = ATOMIC64_INIT(0);

/*
 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul %5 ; "
		"mov %4,%%eax ; "
		"mov %%edx,%4 ; "
		"mul %5 ; "
		"xor %5,%5 ; "
		"add %4,%%eax ; "
		"adc %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
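
/*
 * Worked example for scale_delta() (illustrative note only, not used by the
 * code): mul_frac is a 0.32 fixed-point fraction, so after the shift the
 * result is
 *
 *	product = (delta * mul_frac) >> 32
 *
 * e.g. mul_frac = 0x80000000 (= 0.5) with shift = 0 turns a delta of 1000
 * TSC cycles into 500, i.e. the cycles-to-nanoseconds conversion for a
 * 2 GHz TSC.
 */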

#ifdef __i386__
# define do_div(n,base) ({				\
	u32 __base = (base);				\
	u32 __rem;					\
	__rem = ((u64)(n)) % __base;			\
	(n) = ((u64)(n)) / __base;			\
	__rem;						\
 })
#else
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base);
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
	u64 rem = *n;
	u64 b = base;
	u64 res, d = 1;
	u32 high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (u64) high << 32;
		rem -= (u64) (high*base) << 32;
	}

	while ((s64)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}

# define do_div(n,base) ({				\
	u32 __base = (base);				\
	u32 __rem;					\
	(void)(((typeof((n)) *)0) == ((u64 *)0));	\
	if (likely(((n) >> 32) == 0)) {			\
		__rem = (u32)(n) % __base;		\
		(n) = (u32)(n) / __base;		\
	} else						\
		__rem = __div64_32(&(n), __base);	\
	__rem;						\
 })
#endif

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:		pointer to timespec variable to be set
 * @sec:	seconds to set
 * @nsec:	nanoseconds to set
 *
 * Set seconds and nanoseconds field of a timespec variable and
 * normalize to the timespec storage format
 *
 * Note: The tv_nsec part is always in the range of
 *	0 <= tv_nsec < NSEC_PER_SEC
 * For negative values only the tv_sec field is negative !
 */
static void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
	while (nsec >= NSEC_PER_SEC) {
		/*
		 * The following asm() prevents the compiler from
		 * optimising this loop into a modulo operation. See
		 * also __iter_div_u64_rem() in include/linux/time.h
		 */
		asm("" : "+rm"(nsec));
		nsec -= NSEC_PER_SEC;
		++sec;
	}
	while (nsec < 0) {
		asm("" : "+rm"(nsec));
		nsec += NSEC_PER_SEC;
		--sec;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

static inline
unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
{
	unsigned version = src->version & ~1;
	/* Make sure that the version is read before the data. */
	smp_rmb();
	return version;
}

static inline
bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
			unsigned version)
{
	/* Make sure that the version is re-read after the data. */
	smp_rmb();
	return version != src->version;
}

static inline u64 rdtsc_ordered(void)
{
	/*
	 * FIXME: on Intel CPUs rmb() aka lfence is sufficient, which brings
	 * up to a 2x speedup.
	 */
	mb();
	return rdtsc();
}

static inline
cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
{
	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
	cycle_t offset = scale_delta(delta, src->tsc_to_system_mul,
				     src->tsc_shift);
	return src->system_time + offset;
}

static cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	unsigned version;
	cycle_t ret;
	u64 last;
	u8 flags;

	do {
		version = pvclock_read_begin(src);
		ret = __pvclock_read_cycles(src);
		flags = src->flags;
	} while (pvclock_read_retry(src, version));

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * The assumption here is that last_value, a global accumulator,
	 * always goes forward.  If our reading is less than that, it should
	 * not be much smaller; we assume we are within an error margin, so
	 * the correction does not sacrifice accuracy.
	 *
	 * For reads: the global may have changed between test and return,
	 * but that only means someone else poked the clock at a later time.
	 * We just need to make sure we are not seeing a backwards event.
	 *
	 * For updates: a plain last_value = ret store is not enough, since
	 * two vCPUs could be updating at the same time and one of them could
	 * be slightly behind, which would break the assumption that
	 * last_value always goes forward.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}
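
/*
 * Note on the reader protocol above (illustrative restatement): the
 * hypervisor makes src->version odd while it updates the time info and even
 * again afterwards, so pvclock_read_begin()/pvclock_read_retry() form a
 * seqcount-style reader and the loop in pvclock_clocksource_read() retries
 * until it has copied a consistent snapshot.  When neither
 * PVCLOCK_RAW_CYCLE_BIT nor a trusted PVCLOCK_TSC_STABLE_BIT lets the raw
 * value through, the cmpxchg loop clamps the result to the global
 * last_value, so a reading never appears to go backwards relative to one
 * already returned on another CPU.
 */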

cycle_t kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;
	int index = smp_id();

	src = &hv_clock[index];
	ret = pvclock_clocksource_read(src);
	return ret;
}

void kvm_clock_init(void *data)
{
	int index = smp_id();
	struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

	printf("kvm-clock: cpu %d, msr %p\n", index, hvc);
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1);
}

void kvm_clock_clear(void *data)
{
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL);
}

static void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
				   struct pvclock_vcpu_time_info *vcpu_time,
				   struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

void kvm_get_wallclock(struct timespec *ts)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int index = smp_id();

	wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock);
	vcpu_time = &hv_clock[index];
	pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

void pvclock_set_flags(unsigned char flags)
{
	valid_flags = flags;
}
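
/*
 * Illustrative usage sketch.  This helper (and its name) is an addition for
 * documentation purposes only; nothing in the test calls it.  It shows the
 * intended call order on a single CPU: enable the per-CPU clock, read it,
 * fetch the boot wall clock, then disable the clock again.
 */
static void __attribute__((unused)) kvm_clock_usage_example(void)
{
	struct timespec ts;
	cycle_t t0, t1;

	kvm_clock_init(NULL);		/* point MSR_KVM_SYSTEM_TIME_NEW at hv_clock[smp_id()] */
	t0 = kvm_clock_read();
	t1 = kvm_clock_read();		/* expected not to go backwards: t1 >= t0 */
	kvm_get_wallclock(&ts);		/* wall-clock time at system boot */
	printf("kvm-clock: delta %ld ns, boot at %ld s %ld ns\n",
	       (long)(t1 - t0), (long)ts.tv_sec, (long)ts.tv_nsec);
	kvm_clock_clear(NULL);		/* disable the paravirt clock again */
}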