1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Kernel timekeeping code and accessor functions. Based on code from
4 * timer.c, moved in commit 8524070b7982.
5 */
6 #include <linux/audit.h>
7 #include <linux/clocksource.h>
8 #include <linux/compiler.h>
9 #include <linux/jiffies.h>
10 #include <linux/kobject.h>
11 #include <linux/module.h>
12 #include <linux/nmi.h>
13 #include <linux/pvclock_gtod.h>
14 #include <linux/random.h>
15 #include <linux/sched/clock.h>
16 #include <linux/sched/loadavg.h>
17 #include <linux/static_key.h>
18 #include <linux/stop_machine.h>
19 #include <linux/syscore_ops.h>
20 #include <linux/tick.h>
21 #include <linux/time.h>
22 #include <linux/timex.h>
23 #include <linux/timekeeper_internal.h>
24
25 #include <vdso/auxclock.h>
26
27 #include "tick-internal.h"
28 #include "timekeeping_internal.h"
29 #include "ntp_internal.h"
30
/*
 * Action flags handed to the timekeeper update path:
 * TK_CLEAR_NTP:		reset the NTP state of the timekeeper
 * TK_CLOCK_WAS_SET:	the clock was stepped; bump clock_was_set_seq
 */
#define TK_CLEAR_NTP (1 << 0)
#define TK_CLOCK_WAS_SET (1 << 1)

#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET)

enum timekeeping_adv_mode {
	/* Update timekeeper when a tick has passed */
	TK_ADV_TICK,

	/* Update timekeeper on a direct frequency change */
	TK_ADV_FREQ
};

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
struct tk_data {
	seqcount_raw_spinlock_t	seq;			/* read-side consistency for timekeeper */
	struct timekeeper	timekeeper;		/* live, reader-visible state */
	struct timekeeper	shadow_timekeeper;	/* writer working copy, copied back under @seq */
	raw_spinlock_t		lock;			/* serializes writers */
} ____cacheline_aligned;

/* One instance per timekeeper ID (the core clock plus any auxiliary clocks) */
static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];

/* The core timekeeper */
#define tk_core (timekeeper_data[TIMEKEEPER_CORE])
59
#ifdef CONFIG_POSIX_AUX_CLOCKS
/* Read auxiliary timekeeper @tkid as a timespec64. Returns false when invalid. */
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
	/* Translate the timekeeper ID into the corresponding CLOCK_AUX* id */
	return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
}

/* True if @tk is one of the auxiliary POSIX clocks' timekeepers */
static inline bool tk_is_aux(const struct timekeeper *tk)
{
	return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
}
#else
/* Stubs when auxiliary clocks are configured out */
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
	return false;
}

static inline bool tk_is_aux(const struct timekeeper *tk)
{
	return false;
}
#endif

/* Store the CLOCK_MONOTONIC to auxiliary clock offset in both representations */
static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
{
	tk->offs_aux = offs;
	/* Cached timespec64 form; presumably consumed by the VDSO update - see offs_boot */
	tk->monotonic_to_aux = ktime_to_timespec64(offs);
}
87
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:	Sequence counter for protecting updates. The lowest bit
 *		is the index for the tk_read_base array
 * @base:	tk_read_base array. Access is indexed by the lowest bit of
 *		@seq.
 *
 * See @update_fast_timekeeper() below.
 */
struct tk_fast {
	seqcount_latch_t	seq;
	struct tk_read_base	base[2];
};

/* Suspend-time cycles value for halted fast timekeeper. */
static u64 cycles_at_suspend;

/*
 * Readout of the boot-time dummy clocksource: a frozen cycle count while
 * timekeeping is suspended, otherwise local_clock() (nanoseconds).
 */
static u64 dummy_clock_read(struct clocksource *cs)
{
	if (timekeeping_suspended)
		return cycles_at_suspend;
	return local_clock();
}

static struct clocksource dummy_clock = {
	.read = dummy_clock_read,
};
118
/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed then
 * the fast time keepers are updated with the correct values.
 */
#define FAST_TK_INIT						\
	{							\
		.clock		= &dummy_clock,			\
		.mask		= CLOCKSOURCE_MASK(64),		\
		.mult		= 1,				\
		.shift		= 0,				\
	}

/* Both latch halves start out on the dummy clock (see FAST_TK_INIT above) */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
	.seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
	.base[0] = FAST_TK_INIT,
	.base[1] = FAST_TK_INIT,
};

static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
	.seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
	.base[0] = FAST_TK_INIT,
	.base[1] = FAST_TK_INIT,
};

/* Auxiliary clock hooks; stubbed out when CONFIG_POSIX_AUX_CLOCKS is off */
#ifdef CONFIG_POSIX_AUX_CLOCKS
static __init void tk_aux_setup(void);
static void tk_aux_update_clocksource(void);
static void tk_aux_advance(void);
#else
static inline void tk_aux_setup(void) { }
static inline void tk_aux_update_clocksource(void) { }
static inline void tk_aux_advance(void) { }
#endif
155
/*
 * timekeeper_lock_irqsave - Take the core timekeeper writer lock
 *
 * Disables interrupts and acquires tk_core.lock. The returned flags
 * must be handed back to timekeeper_unlock_irqrestore().
 */
unsigned long timekeeper_lock_irqsave(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&tk_core.lock, flags);
	return flags;
}

/* Counterpart of timekeeper_lock_irqsave() */
void timekeeper_unlock_irqrestore(unsigned long flags)
{
	raw_spin_unlock_irqrestore(&tk_core.lock, flags);
}
168
/*
 * Multigrain timestamps require tracking the latest fine-grained timestamp
 * that has been issued, and never returning a coarse-grained timestamp that is
 * earlier than that value.
 *
 * mg_floor represents the latest fine-grained time that has been handed out as
 * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
 * converted to a realtime clock value on an as-needed basis.
 *
 * Maintaining mg_floor ensures the multigrain interfaces never issue a
 * timestamp earlier than one that has been previously issued.
 *
 * The exception to this rule is when there is a backward realtime clock jump. If
 * such an event occurs, a timestamp can appear to be earlier than a previous one.
 *
 * Stored as an atomic64_t so it can be updated and read without holding
 * the timekeeper lock.
 */
static __cacheline_aligned_in_smp atomic64_t mg_floor;
185
tk_normalize_xtime(struct timekeeper * tk)186 static inline void tk_normalize_xtime(struct timekeeper *tk)
187 {
188 while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
189 tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
190 tk->xtime_sec++;
191 }
192 while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
193 tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
194 tk->raw_sec++;
195 }
196 }
197
tk_xtime(const struct timekeeper * tk)198 static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
199 {
200 struct timespec64 ts;
201
202 ts.tv_sec = tk->xtime_sec;
203 ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
204 return ts;
205 }
206
tk_xtime_coarse(const struct timekeeper * tk)207 static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
208 {
209 struct timespec64 ts;
210
211 ts.tv_sec = tk->xtime_sec;
212 ts.tv_nsec = tk->coarse_nsec;
213 return ts;
214 }
215
/*
 * Update the nanoseconds part for the coarse time keepers. They can't rely
 * on xtime_nsec because xtime_nsec could be adjusted by a small negative
 * amount when the multiplication factor of the clock is adjusted, which
 * could cause the coarse clocks to go slightly backwards. See
 * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
 * clockids which only is updated when the clock has been set or we have
 * accumulated time.
 */
static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
{
	/* Snapshot the unshifted nanoseconds part of xtime */
	tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
}
229
/* Set CLOCK_REALTIME of @tk to @ts and refresh the coarse snapshot */
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
	tk->xtime_sec = ts->tv_sec;
	/* xtime_nsec is kept shifted left by the clocksource shift */
	tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
	tk_update_coarse_nsecs(tk);
}

/* Add @ts to CLOCK_REALTIME of @tk, normalize and refresh the coarse snapshot */
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
	tk->xtime_sec += ts->tv_sec;
	tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
	tk_normalize_xtime(tk);
	tk_update_coarse_nsecs(tk);
}
244
/*
 * Set the wall_to_monotonic offset and keep the derived ktime_t offsets
 * (offs_real, offs_tai) in sync with it.
 */
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
	struct timespec64 tmp;

	/*
	 * Verify consistency of: offset_real = -wall_to_monotonic
	 * before modifying anything
	 */
	set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
					-tk->wall_to_monotonic.tv_nsec);
	WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
	tk->wall_to_monotonic = wtm;
	set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
	/* Paired with READ_ONCE() in ktime_mono_to_any() */
	WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp));
	/* TAI offset trails the realtime offset by the current tai_offset seconds */
	WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)));
}
262
/* Account @delta of sleep time into the boot clock offset of @tk */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	/* Paired with READ_ONCE() in ktime_mono_to_any() */
	WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta));
	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
273
#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE
#include <asm/clock_inlined.h>

/* Key flips when the installed clocksource supports the inlined arch read */
static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined);

/*
 * tk_clock_read - atomic clocksource read() helper
 *
 * This helper is necessary to use in the read paths because, while the
 * seqcount ensures we don't return a bad value while structures are updated,
 * it doesn't protect from potential crashes. There is the possibility that
 * the tkr's clocksource may change between the read reference, and the
 * clock reference passed to the read function. This can cause crashes if
 * the wrong clocksource is passed to the wrong read function.
 * This isn't necessary to use when holding the tk_core.lock or doing
 * a read of the fast-timekeeper tkrs (which is protected by its own locking
 * and update logic).
 */
static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
	/* READ_ONCE() pins one clocksource pointer for the whole read */
	struct clocksource *clock = READ_ONCE(tkr->clock);

	if (static_branch_likely(&clocksource_read_inlined))
		return arch_inlined_clocksource_read(clock);

	return clock->read(clock);
}

static inline void clocksource_disable_inline_read(void)
{
	static_branch_disable(&clocksource_read_inlined);
}

static inline void clocksource_enable_inline_read(void)
{
	static_branch_enable(&clocksource_read_inlined);
}
#else
/* No arch-inlined read available: always use the clocksource callback */
static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
	struct clocksource *clock = READ_ONCE(tkr->clock);

	return clock->read(clock);
}
static inline void clocksource_disable_inline_read(void) { }
static inline void clocksource_enable_inline_read(void) { }
#endif
321
/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:		The target timekeeper to setup.
 * @clock:	Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
	u64 interval;
	u64 tmp, ntpinterval;
	struct clocksource *old_clock;

	/* Let consumers of time snapshots notice the clocksource change */
	++tk->cs_was_changed_seq;
	old_clock = tk->tkr_mono.clock;
	tk->tkr_mono.clock = clock;
	tk->tkr_mono.mask = clock->mask;
	tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

	/* The raw readout base shares the same hardware counter */
	tk->tkr_raw.clock = clock;
	tk->tkr_raw.mask = clock->mask;
	tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

	/* Do the ns -> cycle conversion first, using original mult */
	tmp = NTP_INTERVAL_LENGTH;
	tmp <<= clock->shift;
	ntpinterval = tmp;
	tmp += clock->mult/2;	/* round to nearest cycle */
	do_div(tmp, clock->mult);
	if (tmp == 0)
		tmp = 1;

	interval = (u64) tmp;
	tk->cycle_interval = interval;

	/* Go back from cycles -> shifted ns */
	tk->xtime_interval = interval * clock->mult;
	tk->xtime_remainder = ntpinterval - tk->xtime_interval;
	tk->raw_interval = interval * clock->mult;

	/* if changing clocks, convert xtime_nsec shift units */
	if (old_clock) {
		int shift_change = clock->shift - old_clock->shift;
		if (shift_change < 0) {
			tk->tkr_mono.xtime_nsec >>= -shift_change;
			tk->tkr_raw.xtime_nsec >>= -shift_change;
		} else {
			tk->tkr_mono.xtime_nsec <<= shift_change;
			tk->tkr_raw.xtime_nsec <<= shift_change;
		}
	}

	tk->tkr_mono.shift = clock->shift;
	tk->tkr_raw.shift = clock->shift;

	tk->ntp_error = 0;
	tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
	tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

	/*
	 * The timekeeper keeps its own mult values for the currently
	 * active clocksource. These value will be adjusted via NTP
	 * to counteract clock drifting.
	 */
	tk->tkr_mono.mult = clock->mult;
	tk->tkr_raw.mult = clock->mult;
	tk->ntp_err_mult = 0;
	tk->skip_second_overflow = 0;

	tk->cs_id = clock->id;

	/* Coupled clockevent data */
	if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) &&
	    clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) {
		/*
		 * Aim for an one hour maximum delta and use KHz to handle
		 * clocksources with a frequency above 4GHz correctly as
		 * the frequency argument of clocks_calc_mult_shift() is u32.
		 */
		clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift,
				       NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000);
		/*
		 * Initialize the conversion limit as the previous clocksource
		 * might have the same shift/mult pair so the quick check in
		 * tk_update_ns_to_cyc() fails to update it after a clocksource
		 * change leaving it effectivly zero.
		 */
		tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult);
	}
}
416
/* Timekeeper helper functions. */
/*
 * Overflow-safe slow path for cycles -> nanoseconds: 128bit intermediate
 * via mul_u64_u32_add_u64_shr(). noinline keeps it out of the hot path.
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
	return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}
422
/*
 * Convert an absolute @cycles readout into nanoseconds relative to the
 * readout base @tkr. Handles multiplication overflow and apparent
 * backward motion of the clocksource.
 */
static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
	/* Calculate the delta since the last update_wall_time() */
	u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;

	/*
	 * This detects both negative motion and the case where the delta
	 * overflows the multiplication with tkr->mult.
	 */
	if (unlikely(delta > tkr->clock->max_cycles)) {
		/*
		 * Handle clocksource inconsistency between CPUs to prevent
		 * time from going backwards by checking for the MSB of the
		 * mask being set in the delta.
		 */
		if (delta & ~(mask >> 1))
			return tkr->xtime_nsec >> tkr->shift;

		/* Large but genuine forward delta: use the 128bit slow path */
		return delta_to_ns_safe(tkr, delta);
	}

	/* Common case: fits in 64bit arithmetic */
	return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}
446
timekeeping_get_ns(const struct tk_read_base * tkr)447 static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
448 {
449 return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
450 }
451
/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Pointer to NMI safe timekeeper
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @write_seqcount_latch.
 *
 * So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result is a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
				   struct tk_fast *tkf)
{
	struct tk_read_base *base = tkf->base;

	/* Force readers off to base[1] */
	write_seqcount_latch_begin(&tkf->seq);

	/* Update base[0] */
	memcpy(base, tkr, sizeof(*base));

	/* Force readers back to base[0] */
	write_seqcount_latch(&tkf->seq);

	/* Update base[1] */
	memcpy(base + 1, base, sizeof(*base));

	write_seqcount_latch_end(&tkf->seq);
}
486
/*
 * Latch-protected readout of a fast timekeeper: pick the base selected by
 * the low bit of the sequence, add the current clocksource delta and retry
 * if an update raced with us. NMI safe.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *tkr;
	unsigned int seq;
	u64 now;

	do {
		seq = read_seqcount_latch(&tkf->seq);
		/* Low sequence bit selects the stable half of the latch */
		tkr = tkf->base + (seq & 0x01);
		now = ktime_to_ns(tkr->base);
		now += timekeeping_get_ns(tkr);
	} while (read_seqcount_latch_retry(&tkf->seq, seq));

	return now;
}
502
/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *	now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 */
u64 notrace ktime_get_mono_fast_ns(void)
{
	return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
540
/**
 * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
 *
 * Contrary to ktime_get_mono_fast_ns() this is always correct because the
 * conversion factor is not affected by NTP/PTP correction.
 */
u64 notrace ktime_get_raw_fast_ns(void)
{
	return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
552
/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) Its possible that a timestamp be taken after the boot offset is updated
 * but before the timekeeper is updated. If this happens, the new boot offset
 * is added to the old timekeeping making the clock appear to update slightly
 * earlier:
 *    CPU 0                                        CPU 1
 *    timekeeping_inject_sleeptime64()
 *    __timekeeping_inject_sleeptime(tk, delta);
 *                                                 timestamp();
 *    timekeeping_update_staged(tkd, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated.  Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	/* data_race(): benign race on offs_boot as documented above */
	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
584
/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically which may yield wrong
 * readouts. However, an update of the TAI offset is an rare event e.g., caused
 * by settime or adjtimex with an offset. The user of this function has to deal
 * with the possibility of wrong timestamps in post processing.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	/* data_race(): benign race on offs_tai as documented above */
	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
601
/**
 * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
 *
 * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
 */
u64 ktime_get_real_fast_ns(void)
{
	struct tk_fast *tkf = &tk_fast_mono;
	struct tk_read_base *tkr;
	u64 baser, delta;
	unsigned int seq;

	do {
		seq = raw_read_seqcount_latch(&tkf->seq);
		/* Low sequence bit selects the stable half of the latch */
		tkr = tkf->base + (seq & 0x01);
		/* base_real already includes the realtime offset */
		baser = ktime_to_ns(tkr->base_real);
		delta = timekeeping_get_ns(tkr);
	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

	return baser + delta;
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
624
/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended.  It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
	static struct tk_read_base tkr_dummy;
	const struct tk_read_base *tkr = &tk->tkr_mono;

	/* Freeze the monotonic fast timekeeper on the dummy clock */
	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
	cycles_at_suspend = tk_clock_read(tkr);
	tkr_dummy.clock = &dummy_clock;
	tkr_dummy.base_real = tkr->base + tk->offs_real;
	update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

	/* Same treatment for the raw fast timekeeper */
	tkr = &tk->tkr_raw;
	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
	tkr_dummy.clock = &dummy_clock;
	update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
651
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

/* Notify pvclock listeners about a timekeeper update; @was_set flags a step */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
	raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}
658
/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 * @nb: Pointer to the notifier block to register
 *
 * Returns the result of raw_notifier_chain_register(). The new listener is
 * immediately called once with the current timekeeper state.
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	int ret;

	guard(raw_spinlock_irqsave)(&tk_core.lock);
	ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
	/* Deliver the current state to the freshly registered listener */
	update_pvclock_gtod(tk, true);

	return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
675
/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: Pointer to the notifier block to unregister
 *
 * Returns the result of raw_notifier_chain_unregister().
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
	guard(raw_spinlock_irqsave)(&tk_core.lock);
	return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
687
/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
	tk->next_leap_ktime = ntp_get_next_leap(tk->id);
	if (tk->next_leap_ktime != KTIME_MAX)
		/* Convert to monotonic time */
		tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}
698
/*
 * Leap state update for both shadow and the real timekeeper
 * Separate to spare a full memcpy() of the timekeeper.
 */
static void tk_update_leap_state_all(struct tk_data *tkd)
{
	/* Readers are blocked out while both copies are updated */
	write_seqcount_begin(&tkd->seq);
	tk_update_leap_state(&tkd->shadow_timekeeper);
	tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime;
	write_seqcount_end(&tkd->seq);
}
710
/*
 * Update the ktime_t based scalar nsec members of the timekeeper
 */
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
	u64 seconds;
	u32 nsec;

	/*
	 * The xtime based monotonic readout is:
	 *	nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
	 * The ktime based monotonic readout is:
	 *	nsec = base_mono + now();
	 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
	 */
	seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
	nsec = (u32) tk->wall_to_monotonic.tv_nsec;
	tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);

	/*
	 * The sum of the nanoseconds portions of xtime and
	 * wall_to_monotonic can be greater/equal one second. Take
	 * this into account before updating tk->ktime_sec.
	 */
	nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
	if (nsec >= NSEC_PER_SEC)
		seconds++;
	tk->ktime_sec = seconds;

	/* Update the monotonic raw base */
	tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
743
/*
 * Refresh the ns -> cycles conversion factors of the shadow timekeeper
 * @tks when its mult/shift pair diverged from the real timekeeper @tkc.
 * Only relevant for clocksources with a coupled clockevent device.
 */
static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc)
{
	struct tk_read_base *tkrs = &tks->tkr_mono;
	struct tk_read_base *tkrc = &tkc->tkr_mono;
	unsigned int shift;

	if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) ||
	    !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
		return;

	/* Quick check: nothing to do if the factors are unchanged */
	if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift)
		return;
	/*
	 * The conversion math is simple:
	 *
	 *      CS::MULT       (1 << NS_TO_CYC_SHIFT)
	 *   --------------- = ----------------------
	 *   (1 << CS:SHIFT)       NS_TO_CYC_MULT
	 *
	 * Ergo:
	 *
	 *   NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT
	 *
	 * NS_TO_CYC_SHIFT has been set up in tk_setup_internals()
	 */
	shift = tkrs->shift + tks->cs_ns_to_cyc_shift;
	tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult);
	/* Largest delta (ns) that can be multiplied without overflow */
	tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult);
}
773
/*
 * Restore the shadow timekeeper from the real timekeeper.
 * Used to discard staged modifications of the shadow copy.
 */
static void timekeeping_restore_shadow(struct tk_data *tkd)
{
	lockdep_assert_held(&tkd->lock);
	memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
}
782
/*
 * Finalize a staged update: propagate derived state of the shadow
 * timekeeper to VDSO, fast timekeepers and listeners, then copy the
 * shadow over the real timekeeper. Caller holds tkd->lock. @action is
 * a TK_* flag combination (TK_CLEAR_NTP, TK_CLOCK_WAS_SET).
 */
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
	struct timekeeper *tk = &tkd->shadow_timekeeper;

	lockdep_assert_held(&tkd->lock);

	/*
	 * Block out readers before running the updates below because that
	 * updates VDSO and other time related infrastructure. Not blocking
	 * the readers might let a reader see time going backwards when
	 * reading from the VDSO after the VDSO update and then reading in
	 * the kernel from the timekeeper before that got updated.
	 */
	write_seqcount_begin(&tkd->seq);

	if (action & TK_CLEAR_NTP) {
		tk->ntp_error = 0;
		ntp_clear(tk->id);
	}

	tk_update_leap_state(tk);
	tk_update_ktime_data(tk);
	/* Precompute the realtime base for ktime_get_real_fast_ns() */
	tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;

	if (tk->id == TIMEKEEPER_CORE) {
		tk_update_ns_to_cyc(tk, &tkd->timekeeper);
		update_vsyscall(tk);
		update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

		update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
		update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
	} else if (tk_is_aux(tk)) {
		/* Auxiliary clocks only update their VDSO data */
		vdso_time_update_aux(tk);
	}

	if (action & TK_CLOCK_WAS_SET)
		tk->clock_was_set_seq++;

	/*
	 * Update the real timekeeper.
	 *
	 * We could avoid this memcpy() by switching pointers, but that has
	 * the downside that the reader side does not longer benefit from
	 * the cacheline optimized data layout of the timekeeper and requires
	 * another indirection.
	 */
	memcpy(&tkd->timekeeper, tk, sizeof(*tk));
	write_seqcount_end(&tkd->seq);
}
832
/**
 * timekeeping_forward_now - update clock to the current time
 * @tk:		Pointer to the timekeeper to update
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
	u64 cycle_now, delta;

	cycle_now = tk_clock_read(&tk->tkr_mono);
	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
				  tk->tkr_mono.clock->max_raw_delta);
	tk->tkr_mono.cycle_last = cycle_now;
	tk->tkr_raw.cycle_last  = cycle_now;

	/*
	 * Accumulate in max_cycles sized chunks so the multiplication
	 * with mult cannot overflow.
	 */
	while (delta > 0) {
		u64 max = tk->tkr_mono.clock->max_cycles;
		u64 incr = delta < max ? delta : max;

		tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
		tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
		tk_normalize_xtime(tk);
		delta -= incr;
	}
	tk_update_coarse_nsecs(tk);
}
862
/*
 * ktime_expiry_to_cycles - Convert a expiry time to clocksource cycles
 * @id:		Clocksource ID which is required for validity
 * @expires_ns:	Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted
 * @cycles:	Pointer to storage for corresponding absolute cycles value
 *
 * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value
 * based on the correlated clocksource of the clockevent device by using
 * the base nanoseconds and cycles values of the last timekeeper update and
 * converting the delta between @expires_ns and base nanoseconds to cycles.
 *
 * This only works for clockevent devices which are using a less than or
 * equal comparator against the clocksource.
 *
 * Utilizing this avoids two clocksource reads for such devices, the
 * ktime_get() in clockevents_program_event() to calculate the delta expiry
 * value and the readout in the device::set_next_event() callback to
 * convert the delta back to a absolute comparator value.
 *
 * Returns: True if @id matches the current clocksource ID, false otherwise
 */
bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	struct tk_read_base *tkrm = &tk->tkr_mono;
	ktime_t base_ns, delta_ns, max_ns;
	u64 base_cycles, delta_cycles;
	unsigned int seq;
	u32 mult, shift;

	/*
	 * Racy check to avoid the seqcount overhead when ID does not match. If
	 * the relevant clocksource is installed concurrently, then this will
	 * just delay the switch over to this mechanism until the next event is
	 * programmed. If the ID is not matching the clock events code will use
	 * the regular relative set_next_event() callback as before.
	 */
	if (data_race(tk->cs_id) != id)
		return false;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		/* Recheck under the seqcount: the clocksource may have changed */
		if (tk->cs_id != id)
			return false;

		/* Snapshot the correlated base (cycles, ns) of the last update */
		base_cycles = tkrm->cycle_last;
		base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift);

		mult = tk->cs_ns_to_cyc_mult;
		shift = tk->cs_ns_to_cyc_shift;
		max_ns = tk->cs_ns_to_cyc_maxns;

	} while (read_seqcount_retry(&tk_core.seq, seq));

	/* Prevent negative deltas and multiplication overflows */
	delta_ns = min(expires_ns - base_ns, max_ns);
	delta_ns = max(delta_ns, 0);

	/* Convert to cycles */
	delta_cycles = ((u64)delta_ns * mult) >> shift;
	*cycles = base_cycles + delta_cycles;
	return true;
}
927
928 /**
929 * ktime_get_real_ts64 - Returns the time of day in a timespec64.
930 * @ts: pointer to the timespec to be set
931 *
932 * Returns the time of day in a timespec64 (WARN if suspended).
933 */
ktime_get_real_ts64(struct timespec64 * ts)934 void ktime_get_real_ts64(struct timespec64 *ts)
935 {
936 struct timekeeper *tk = &tk_core.timekeeper;
937 unsigned int seq;
938 u64 nsecs;
939
940 WARN_ON(timekeeping_suspended);
941
942 do {
943 seq = read_seqcount_begin(&tk_core.seq);
944
945 ts->tv_sec = tk->xtime_sec;
946 nsecs = timekeeping_get_ns(&tk->tkr_mono);
947
948 } while (read_seqcount_retry(&tk_core.seq, seq));
949
950 ts->tv_nsec = 0;
951 timespec64_add_ns(ts, nsecs);
952 }
953 EXPORT_SYMBOL(ktime_get_real_ts64);
954
ktime_get(void)955 ktime_t ktime_get(void)
956 {
957 struct timekeeper *tk = &tk_core.timekeeper;
958 unsigned int seq;
959 ktime_t base;
960 u64 nsecs;
961
962 WARN_ON(timekeeping_suspended);
963
964 do {
965 seq = read_seqcount_begin(&tk_core.seq);
966 base = tk->tkr_mono.base;
967 nsecs = timekeeping_get_ns(&tk->tkr_mono);
968
969 } while (read_seqcount_retry(&tk_core.seq, seq));
970
971 return ktime_add_ns(base, nsecs);
972 }
973 EXPORT_SYMBOL_GPL(ktime_get);
974
ktime_get_resolution_ns(void)975 u32 ktime_get_resolution_ns(void)
976 {
977 struct timekeeper *tk = &tk_core.timekeeper;
978 unsigned int seq;
979 u32 nsecs;
980
981 WARN_ON(timekeeping_suspended);
982
983 do {
984 seq = read_seqcount_begin(&tk_core.seq);
985 nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
986 } while (read_seqcount_retry(&tk_core.seq, seq));
987
988 return nsecs;
989 }
990 EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
991
/* CLOCK_MONOTONIC offsets of the core timekeeper, indexed by enum tk_offsets */
static const ktime_t *const offsets[TK_OFFS_MAX] = {
	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
	[TK_OFFS_TAI]	= &tk_core.timekeeper.offs_tai,
};
997
ktime_get_with_offset(enum tk_offsets offs)998 ktime_t ktime_get_with_offset(enum tk_offsets offs)
999 {
1000 struct timekeeper *tk = &tk_core.timekeeper;
1001 const ktime_t *offset = offsets[offs];
1002 unsigned int seq;
1003 ktime_t base;
1004 u64 nsecs;
1005
1006 WARN_ON(timekeeping_suspended);
1007
1008 do {
1009 seq = read_seqcount_begin(&tk_core.seq);
1010 base = ktime_add(tk->tkr_mono.base, *offset);
1011 nsecs = timekeeping_get_ns(&tk->tkr_mono);
1012
1013 } while (read_seqcount_retry(&tk_core.seq, seq));
1014
1015 return ktime_add_ns(base, nsecs);
1016
1017 }
1018 EXPORT_SYMBOL_GPL(ktime_get_with_offset);
1019
ktime_get_coarse_with_offset(enum tk_offsets offs)1020 ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
1021 {
1022 struct timekeeper *tk = &tk_core.timekeeper;
1023 const ktime_t *offset = offsets[offs];
1024 unsigned int seq;
1025 ktime_t base;
1026 u64 nsecs;
1027
1028 WARN_ON(timekeeping_suspended);
1029
1030 do {
1031 seq = read_seqcount_begin(&tk_core.seq);
1032 base = ktime_add(tk->tkr_mono.base, *offset);
1033 nsecs = tk->coarse_nsec;
1034
1035 } while (read_seqcount_retry(&tk_core.seq, seq));
1036
1037 return ktime_add_ns(base, nsecs);
1038 }
1039 EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
1040
1041 /**
1042 * ktime_mono_to_any() - convert monotonic time to any other time
1043 * @tmono: time to convert.
1044 * @offs: which offset to use
1045 */
ktime_mono_to_any(ktime_t tmono,enum tk_offsets offs)1046 ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
1047 {
1048 const ktime_t *offset = offsets[offs];
1049 unsigned int seq;
1050 ktime_t tconv;
1051
1052 if (IS_ENABLED(CONFIG_64BIT)) {
1053 /*
1054 * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and
1055 * tk_update_sleep_time().
1056 */
1057 return ktime_add(tmono, READ_ONCE(*offset));
1058 }
1059
1060 do {
1061 seq = read_seqcount_begin(&tk_core.seq);
1062 tconv = ktime_add(tmono, *offset);
1063 } while (read_seqcount_retry(&tk_core.seq, seq));
1064
1065 return tconv;
1066 }
1067 EXPORT_SYMBOL_GPL(ktime_mono_to_any);
1068
1069 /**
1070 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
1071 */
ktime_get_raw(void)1072 ktime_t ktime_get_raw(void)
1073 {
1074 struct timekeeper *tk = &tk_core.timekeeper;
1075 unsigned int seq;
1076 ktime_t base;
1077 u64 nsecs;
1078
1079 do {
1080 seq = read_seqcount_begin(&tk_core.seq);
1081 base = tk->tkr_raw.base;
1082 nsecs = timekeeping_get_ns(&tk->tkr_raw);
1083
1084 } while (read_seqcount_retry(&tk_core.seq, seq));
1085
1086 return ktime_add_ns(base, nsecs);
1087 }
1088 EXPORT_SYMBOL_GPL(ktime_get_raw);
1089
1090 /**
1091 * ktime_get_ts64 - get the monotonic clock in timespec64 format
1092 * @ts: pointer to timespec variable
1093 *
1094 * The function calculates the monotonic clock from the realtime
1095 * clock and the wall_to_monotonic offset and stores the result
1096 * in normalized timespec64 format in the variable pointed to by @ts.
1097 */
ktime_get_ts64(struct timespec64 * ts)1098 void ktime_get_ts64(struct timespec64 *ts)
1099 {
1100 struct timekeeper *tk = &tk_core.timekeeper;
1101 struct timespec64 tomono;
1102 unsigned int seq;
1103 u64 nsec;
1104
1105 WARN_ON(timekeeping_suspended);
1106
1107 do {
1108 seq = read_seqcount_begin(&tk_core.seq);
1109 ts->tv_sec = tk->xtime_sec;
1110 nsec = timekeeping_get_ns(&tk->tkr_mono);
1111 tomono = tk->wall_to_monotonic;
1112
1113 } while (read_seqcount_retry(&tk_core.seq, seq));
1114
1115 ts->tv_sec += tomono.tv_sec;
1116 ts->tv_nsec = 0;
1117 timespec64_add_ns(ts, nsec + tomono.tv_nsec);
1118 }
1119 EXPORT_SYMBOL_GPL(ktime_get_ts64);
1120
1121 /**
1122 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
1123 *
1124 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
1125 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
1126 * works on both 32 and 64 bit systems. On 32 bit systems the readout
1127 * covers ~136 years of uptime which should be enough to prevent
1128 * premature wrap arounds.
1129 */
ktime_get_seconds(void)1130 time64_t ktime_get_seconds(void)
1131 {
1132 struct timekeeper *tk = &tk_core.timekeeper;
1133
1134 WARN_ON(timekeeping_suspended);
1135 return tk->ktime_sec;
1136 }
1137 EXPORT_SYMBOL_GPL(ktime_get_seconds);
1138
1139 /**
1140 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
1141 *
1142 * Returns the wall clock seconds since 1970.
1143 *
1144 * For 64bit systems the fast access to tk->xtime_sec is preserved. On
1145 * 32bit systems the access must be protected with the sequence
1146 * counter to provide "atomic" access to the 64bit tk->xtime_sec
1147 * value.
1148 */
ktime_get_real_seconds(void)1149 time64_t ktime_get_real_seconds(void)
1150 {
1151 struct timekeeper *tk = &tk_core.timekeeper;
1152 time64_t seconds;
1153 unsigned int seq;
1154
1155 if (IS_ENABLED(CONFIG_64BIT))
1156 return tk->xtime_sec;
1157
1158 do {
1159 seq = read_seqcount_begin(&tk_core.seq);
1160 seconds = tk->xtime_sec;
1161
1162 } while (read_seqcount_retry(&tk_core.seq, seq));
1163
1164 return seconds;
1165 }
1166 EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
1167
1168 /**
1169 * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
1170 *
1171 * The same as ktime_get_real_seconds() but without the sequence counter
1172 * protection. This function is used in restricted contexts like the x86 MCE
1173 * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
1174 * completed modification and only to be used for such critical contexts.
1175 *
1176 * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
1177 */
__ktime_get_real_seconds(void)1178 noinstr time64_t __ktime_get_real_seconds(void)
1179 {
1180 struct timekeeper *tk = &tk_core.timekeeper;
1181
1182 return tk->xtime_sec;
1183 }
1184
/**
 * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
 * @systime_snapshot: pointer to struct receiving the system time snapshot
 *
 * All fields (cycles, realtime, boottime, raw and the change sequence
 * numbers) are captured from a single clocksource read inside one
 * consistent seqcount read section.
 */
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	ktime_t base_raw;
	ktime_t base_real;
	ktime_t base_boot;
	u64 nsec_raw;
	u64 nsec_real;
	u64 now;

	WARN_ON_ONCE(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		/* One clocksource read feeds both the mono and raw conversions */
		now = tk_clock_read(&tk->tkr_mono);
		systime_snapshot->cs_id = tk->tkr_mono.clock->id;
		systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
		systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
		base_real = ktime_add(tk->tkr_mono.base,
				      tk_core.timekeeper.offs_real);
		base_boot = ktime_add(tk->tkr_mono.base,
				      tk_core.timekeeper.offs_boot);
		base_raw = tk->tkr_raw.base;
		nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
	} while (read_seqcount_retry(&tk_core.seq, seq));

	systime_snapshot->cycles = now;
	systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
	systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
	systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);
1223
1224 /* Scale base by mult/div checking for overflow */
scale64_check_overflow(u64 mult,u64 div,u64 * base)1225 static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
1226 {
1227 u64 tmp, rem;
1228
1229 tmp = div64_u64_rem(*base, div, &rem);
1230
1231 if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
1232 ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
1233 return -EOVERFLOW;
1234 tmp *= mult;
1235
1236 rem = div64_u64(rem * mult, div);
1237 *base = tmp + rem;
1238 return 0;
1239 }
1240
/**
 * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
 * @history: Snapshot representing start of history
 * @partial_history_cycles: Cycle offset into history (fractional part)
 * @total_history_cycles: Total history length in cycles
 * @discontinuity: True indicates clock was set on history period
 * @ts: Cross timestamp that should be adjusted using
 *	partial/total ratio
 *
 * Helper function used by get_device_system_crosststamp() to correct the
 * crosstimestamp corresponding to the start of the current interval to the
 * system counter value (timestamp point) provided by the driver. The
 * total_history_* quantities are the total history starting at the provided
 * reference point and ending at the start of the current interval. The cycle
 * count between the driver timestamp point and the start of the current
 * interval is partial_history_cycles.
 *
 * Returns: 0 on success, -EOVERFLOW if the ratio scaling would overflow
 */
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
					 u64 partial_history_cycles,
					 u64 total_history_cycles,
					 bool discontinuity,
					 struct system_device_crosststamp *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	u64 corr_raw, corr_real;
	bool interp_forward;
	int ret;

	/* Nothing to adjust if the history is empty or the point is exact */
	if (total_history_cycles == 0 || partial_history_cycles == 0)
		return 0;

	/* Interpolate shortest distance from beginning or end of history */
	interp_forward = partial_history_cycles > total_history_cycles / 2;
	partial_history_cycles = interp_forward ?
		total_history_cycles - partial_history_cycles :
		partial_history_cycles;

	/*
	 * Scale the monotonic raw time delta by:
	 *	partial_history_cycles / total_history_cycles
	 */
	corr_raw = (u64)ktime_to_ns(
		ktime_sub(ts->sys_monoraw, history->raw));
	ret = scale64_check_overflow(partial_history_cycles,
				     total_history_cycles, &corr_raw);
	if (ret)
		return ret;

	/*
	 * If there is a discontinuity in the history, scale monotonic raw
	 *	correction by:
	 *	mult(real)/mult(raw) yielding the realtime correction
	 * Otherwise, calculate the realtime correction similar to monotonic
	 *	raw calculation
	 */
	if (discontinuity) {
		corr_real = mul_u64_u32_div
			(corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
	} else {
		corr_real = (u64)ktime_to_ns(
			ktime_sub(ts->sys_realtime, history->real));
		ret = scale64_check_overflow(partial_history_cycles,
					     total_history_cycles, &corr_real);
		if (ret)
			return ret;
	}

	/* Fixup monotonic raw and real time time values */
	if (interp_forward) {
		ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
		ts->sys_realtime = ktime_add_ns(history->real, corr_real);
	} else {
		ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
		ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
	}

	return 0;
}
1319
1320 /*
1321 * timestamp_in_interval - true if ts is chronologically in [start, end]
1322 *
1323 * True if ts occurs chronologically at or after start, and before or at end.
1324 */
timestamp_in_interval(u64 start,u64 end,u64 ts)1325 static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
1326 {
1327 if (ts >= start && ts <= end)
1328 return true;
1329 if (start > end && (ts >= start || ts <= end))
1330 return true;
1331 return false;
1332 }
1333
convert_clock(u64 * val,u32 numerator,u32 denominator)1334 static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
1335 {
1336 u64 rem, res;
1337
1338 if (!numerator || !denominator)
1339 return false;
1340
1341 res = div64_u64_rem(*val, denominator, &rem) * numerator;
1342 *val = res + div_u64(rem * numerator, denominator);
1343 return true;
1344 }
1345
convert_base_to_cs(struct system_counterval_t * scv)1346 static bool convert_base_to_cs(struct system_counterval_t *scv)
1347 {
1348 struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
1349 struct clocksource_base *base;
1350 u32 num, den;
1351
1352 /* The timestamp was taken from the time keeper clock source */
1353 if (cs->id == scv->cs_id)
1354 return true;
1355
1356 /*
1357 * Check whether cs_id matches the base clock. Prevent the compiler from
1358 * re-evaluating @base as the clocksource might change concurrently.
1359 */
1360 base = READ_ONCE(cs->base);
1361 if (!base || base->id != scv->cs_id)
1362 return false;
1363
1364 num = scv->use_nsecs ? cs->freq_khz : base->numerator;
1365 den = scv->use_nsecs ? USEC_PER_SEC : base->denominator;
1366
1367 if (!convert_clock(&scv->cycles, num, den))
1368 return false;
1369
1370 scv->cycles += base->offset;
1371 return true;
1372 }
1373
convert_cs_to_base(u64 * cycles,enum clocksource_ids base_id)1374 static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
1375 {
1376 struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
1377 struct clocksource_base *base;
1378
1379 /*
1380 * Check whether base_id matches the base clock. Prevent the compiler from
1381 * re-evaluating @base as the clocksource might change concurrently.
1382 */
1383 base = READ_ONCE(cs->base);
1384 if (!base || base->id != base_id)
1385 return false;
1386
1387 *cycles -= base->offset;
1388 if (!convert_clock(cycles, base->denominator, base->numerator))
1389 return false;
1390 return true;
1391 }
1392
/*
 * convert_ns_to_cs - Convert a nanoseconds delta (relative to base_real)
 * to timekeeper clocksource cycles.
 *
 * Returns false if shifting the delta left by tkr->shift would overflow
 * 64 bits; otherwise replaces *delta with the cycle count.
 */
static bool convert_ns_to_cs(u64 *delta)
{
	struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;

	/* Reject deltas for which (*delta << shift) would not fit in 64 bits */
	if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta))
		return false;

	/* Inverse of the cycles-to-ns conversion, accounting for xtime_nsec */
	*delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult);
	return true;
}
1403
/**
 * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
 * @treal:	CLOCK_REALTIME timestamp to convert
 * @base_id:	base clocksource id
 * @cycles:	pointer to store the converted base clock timestamp
 *
 * Converts a supplied, future realtime clock value to the corresponding base clock value.
 *
 * Return: true if the conversion is successful, false otherwise.
 */
bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	u64 delta;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		/* Only future timestamps (relative to base_real) are convertible */
		if ((u64)treal < tk->tkr_mono.base_real)
			return false;
		delta = (u64)treal - tk->tkr_mono.base_real;
		if (!convert_ns_to_cs(&delta))
			return false;
		*cycles = tk->tkr_mono.cycle_last + delta;
		if (!convert_cs_to_base(cycles, base_id))
			return false;
	} while (read_seqcount_retry(&tk_core.seq, seq));

	return true;
}
EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);
1435
/**
 * get_device_system_crosststamp - Synchronously capture system/device timestamp
 * @get_time_fn:	Callback to get simultaneous device time and
 *	system counter from the device driver
 * @ctx:		Context passed to get_time_fn()
 * @history_begin:	Historical reference point used to interpolate system
 *	time when counter provided by the driver is before the current interval
 * @xtstamp:		Receives simultaneously captured system and device time
 *
 * Reads a timestamp from a device and correlates it to system time.
 *
 * Return: 0 on success, negative errno if the counter cannot be converted
 * or the history reference is unusable.
 */
int get_device_system_crosststamp(int (*get_time_fn)
				  (ktime_t *device_time,
				   struct system_counterval_t *sys_counterval,
				   void *ctx),
				  void *ctx,
				  struct system_time_snapshot *history_begin,
				  struct system_device_crosststamp *xtstamp)
{
	struct system_counterval_t system_counterval = {};
	struct timekeeper *tk = &tk_core.timekeeper;
	u64 cycles, now, interval_start;
	unsigned int clock_was_set_seq = 0;
	ktime_t base_real, base_raw;
	u64 nsec_real, nsec_raw;
	u8 cs_was_changed_seq;
	unsigned int seq;
	bool do_interp;
	int ret;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		/*
		 * Try to synchronously capture device time and a system
		 * counter value calling back into the device driver
		 */
		ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
		if (ret)
			return ret;

		/*
		 * Verify that the clocksource ID associated with the captured
		 * system counter value is the same as for the currently
		 * installed timekeeper clocksource
		 */
		if (system_counterval.cs_id == CSID_GENERIC ||
		    !convert_base_to_cs(&system_counterval))
			return -ENODEV;
		cycles = system_counterval.cycles;

		/*
		 * Check whether the system counter value provided by the
		 * device driver is on the current timekeeping interval.
		 */
		now = tk_clock_read(&tk->tkr_mono);
		interval_start = tk->tkr_mono.cycle_last;
		if (!timestamp_in_interval(interval_start, now, cycles)) {
			/*
			 * Counter predates this interval. Record the change
			 * sequence numbers and fall back to interpolation
			 * from the interval start below.
			 */
			clock_was_set_seq = tk->clock_was_set_seq;
			cs_was_changed_seq = tk->cs_was_changed_seq;
			cycles = interval_start;
			do_interp = true;
		} else {
			do_interp = false;
		}

		base_real = ktime_add(tk->tkr_mono.base,
				      tk_core.timekeeper.offs_real);
		base_raw = tk->tkr_raw.base;

		nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
	} while (read_seqcount_retry(&tk_core.seq, seq));

	xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
	xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);

	/*
	 * Interpolate if necessary, adjusting back from the start of the
	 * current interval
	 */
	if (do_interp) {
		u64 partial_history_cycles, total_history_cycles;
		bool discontinuity;

		/*
		 * Check that the counter value is not before the provided
		 * history reference and that the history doesn't cross a
		 * clocksource change
		 */
		if (!history_begin ||
		    !timestamp_in_interval(history_begin->cycles,
					   cycles, system_counterval.cycles) ||
		    history_begin->cs_was_changed_seq != cs_was_changed_seq)
			return -EINVAL;
		partial_history_cycles = cycles - system_counterval.cycles;
		total_history_cycles = cycles - history_begin->cycles;
		discontinuity =
			history_begin->clock_was_set_seq != clock_was_set_seq;

		ret = adjust_historical_crosststamp(history_begin,
						    partial_history_cycles,
						    total_history_cycles,
						    discontinuity, xtstamp);
		if (ret)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
1546
1547 /**
1548 * timekeeping_clocksource_has_base - Check whether the current clocksource
1549 * is based on given a base clock
1550 * @id: base clocksource ID
1551 *
1552 * Note: The return value is a snapshot which can become invalid right
1553 * after the function returns.
1554 *
1555 * Return: true if the timekeeper clocksource has a base clock with @id,
1556 * false otherwise
1557 */
timekeeping_clocksource_has_base(enum clocksource_ids id)1558 bool timekeeping_clocksource_has_base(enum clocksource_ids id)
1559 {
1560 /*
1561 * This is a snapshot, so no point in using the sequence
1562 * count. Just prevent the compiler from re-evaluating @base as the
1563 * clocksource might change concurrently.
1564 */
1565 struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base);
1566
1567 return base ? base->id == id : false;
1568 }
1569 EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);
1570
/**
 * do_settimeofday64 - Sets the time of day.
 * @ts:     pointer to the timespec64 variable containing the new time
 *
 * Sets the time of day to the new time and update NTP and notify hrtimers
 *
 * Return: 0 on success, -EINVAL if @ts is invalid or would move
 * CLOCK_MONOTONIC backwards relative to the wall clock.
 */
int do_settimeofday64(const struct timespec64 *ts)
{
	struct timespec64 ts_delta, xt;

	if (!timespec64_valid_settod(ts))
		return -EINVAL;

	scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
		struct timekeeper *tks = &tk_core.shadow_timekeeper;

		/* Fold the pending clocksource delta into the shadow copy */
		timekeeping_forward_now(tks);

		xt = tk_xtime(tks);
		ts_delta = timespec64_sub(*ts, xt);

		/* Reject settings which would make wall_to_monotonic positive */
		if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) {
			/* Discard the modified shadow copy on failure */
			timekeeping_restore_shadow(&tk_core);
			return -EINVAL;
		}

		tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta));
		tk_set_xtime(tks, ts);
		timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
	}

	/* Signal hrtimers about time change */
	clock_was_set(CLOCK_SET_WALL);

	audit_tk_injoffset(ts_delta);
	add_device_randomness(ts, sizeof(*ts));
	return 0;
}
EXPORT_SYMBOL(do_settimeofday64);
1610
timekeeper_is_core_tk(struct timekeeper * tk)1611 static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
1612 {
1613 return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
1614 }
1615
/**
 * __timekeeping_inject_offset - Adds or subtracts from the current time.
 * @tkd:	Pointer to the timekeeper to modify
 * @ts:		Pointer to the timespec variable containing the offset
 *
 * Adds or subtracts an offset value from the current time.
 *
 * Must be called with tkd->lock held. Operates on the shadow timekeeper
 * and either commits it or restores the previous state on error.
 *
 * Return: 0 on success, -EINVAL if @ts or the resulting time is invalid.
 */
static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
{
	struct timekeeper *tks = &tkd->shadow_timekeeper;
	struct timespec64 tmp;

	if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
		return -EINVAL;

	/* Fold the pending clocksource delta into the shadow copy first */
	timekeeping_forward_now(tks);

	if (timekeeper_is_core_tk(tks)) {
		/* Make sure the proposed value is valid */
		tmp = timespec64_add(tk_xtime(tks), *ts);
		if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
		    !timespec64_valid_settod(&tmp)) {
			timekeeping_restore_shadow(tkd);
			return -EINVAL;
		}

		tk_xtime_add(tks, ts);
		tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
	} else {
		/* Auxiliary clocks only carry an offset against the core clock */
		struct tk_read_base *tkr_mono = &tks->tkr_mono;
		ktime_t now, offs;

		/* Get the current time */
		now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
		/* Add the relative offset change */
		offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));

		/* Prevent that the resulting time becomes negative */
		if (ktime_add(now, offs) < 0) {
			timekeeping_restore_shadow(tkd);
			return -EINVAL;
		}
		tk_update_aux_offs(tks, offs);
	}

	timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
	return 0;
}
1664
/*
 * timekeeping_inject_offset - Apply an offset to the core timekeeper and,
 * on success, notify hrtimers about the wall clock change.
 *
 * Returns 0 on success or the error from __timekeeping_inject_offset().
 */
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
	int ret;

	scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
		ret = __timekeeping_inject_offset(&tk_core, ts);

	/* Signal hrtimers about time change */
	if (!ret)
		clock_was_set(CLOCK_SET_WALL);
	return ret;
}
1677
/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc. Set by timekeeping_warp_clock() when the
 * persistent clock runs in local time rather than UTC.
 */
int persistent_clock_is_local;
1683
1684 /*
1685 * Adjust the time obtained from the CMOS to be UTC time instead of
1686 * local time.
1687 *
1688 * This is ugly, but preferable to the alternatives. Otherwise we
1689 * would either need to write a program to do it in /etc/rc (and risk
1690 * confusion if the program gets run more than once; it would also be
1691 * hard to make the program warp the clock precisely n hours) or
1692 * compile in the timezone information into the kernel. Bad, bad....
1693 *
1694 * - TYT, 1992-01-01
1695 *
1696 * The best thing to do is to keep the CMOS clock in universal time (UTC)
1697 * as real UNIX machines always do it. This avoids all headaches about
1698 * daylight saving times and warping kernel clocks.
1699 */
timekeeping_warp_clock(void)1700 void timekeeping_warp_clock(void)
1701 {
1702 if (sys_tz.tz_minuteswest != 0) {
1703 struct timespec64 adjust;
1704
1705 persistent_clock_is_local = 1;
1706 adjust.tv_sec = sys_tz.tz_minuteswest * 60;
1707 adjust.tv_nsec = 0;
1708 timekeeping_inject_offset(&adjust);
1709 }
1710 }
1711
1712 /*
1713 * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
1714 */
__timekeeping_set_tai_offset(struct timekeeper * tk,s32 tai_offset)1715 static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
1716 {
1717 tk->tai_offset = tai_offset;
1718 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
1719 }
1720
/*
 * change_clocksource - Swaps clocksources if a new one is available
 *
 * Accumulates current time interval and initializes new clocksource.
 * Runs in stop_machine() context, so no concurrent timekeeper updates.
 */
static int change_clocksource(void *data)
{
	struct clocksource *new = data, *old = NULL;

	/*
	 * If the clocksource is in a module, get a module reference.
	 * Succeeds for built-in code (owner == NULL) as well. Abort if the
	 * reference can't be acquired.
	 */
	if (!try_module_get(new->owner))
		return 0;

	/* Abort if the device can't be enabled */
	if (new->enable && new->enable(new) != 0) {
		module_put(new->owner);
		return 0;
	}

	scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
		struct timekeeper *tks = &tk_core.shadow_timekeeper;

		/* Accumulate the pending delta of the old clocksource first */
		timekeeping_forward_now(tks);
		old = tks->tkr_mono.clock;
		tk_setup_internals(tks, new);
		timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
	}

	tk_aux_update_clocksource();

	/* Release the previous clocksource after the switch is visible */
	if (old) {
		if (old->disable)
			old->disable(old);
		module_put(old->owner);
	}

	return 0;
}
1763
/**
 * timekeeping_notify - Install a new clock source
 * @clock:		pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 *
 * Return: 0 when @clock is installed as the timekeeper clocksource,
 * -1 otherwise.
 */
int timekeeping_notify(struct clocksource *clock)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	if (tk->tkr_mono.clock == clock)
		return 0;

	/* Disable inlined reads across the clocksource switch */
	clocksource_disable_inline_read();

	stop_machine(change_clocksource, clock, NULL);

	/*
	 * If the clocksource has been selected and supports inlined reads
	 * enable the branch.
	 */
	if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ)
		clocksource_enable_inline_read();

	tick_clock_notify();
	return tk->tkr_mono.clock == clock ? 0 : -1;
}
1793
1794 /**
1795 * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
1796 * @ts: pointer to the timespec64 to be set
1797 *
1798 * Returns the raw monotonic time (completely un-modified by ntp)
1799 */
ktime_get_raw_ts64(struct timespec64 * ts)1800 void ktime_get_raw_ts64(struct timespec64 *ts)
1801 {
1802 struct timekeeper *tk = &tk_core.timekeeper;
1803 unsigned int seq;
1804 u64 nsecs;
1805
1806 do {
1807 seq = read_seqcount_begin(&tk_core.seq);
1808 ts->tv_sec = tk->raw_sec;
1809 nsecs = timekeeping_get_ns(&tk->tkr_raw);
1810
1811 } while (read_seqcount_retry(&tk_core.seq, seq));
1812
1813 ts->tv_nsec = 0;
1814 timespec64_add_ns(ts, nsecs);
1815 }
1816 EXPORT_SYMBOL(ktime_get_raw_ts64);
1817
/**
 * ktime_get_clock_ts64 - Returns time of a clock in a timespec
 * @id: POSIX clock ID of the clock to read
 * @ts: Pointer to the timespec64 to be set
 *
 * The timestamp is invalidated (@ts->sec is set to -1) if the
 * clock @id is not available.
 */
void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
{
	/* Invalidate time stamp; only overwritten on a successful readout */
	ts->tv_sec = -1;
	ts->tv_nsec = 0;

	switch (id) {
	case CLOCK_REALTIME:
		ktime_get_real_ts64(ts);
		return;
	case CLOCK_MONOTONIC:
		ktime_get_ts64(ts);
		return;
	case CLOCK_MONOTONIC_RAW:
		ktime_get_raw_ts64(ts);
		return;
	case CLOCK_AUX ... CLOCK_AUX_LAST:
		/* Without aux clock support the invalidated stamp is kept */
		if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
			ktime_get_aux_ts64(id, ts);
		return;
	default:
		/* Unsupported clock ID: warn once, leave the stamp invalid */
		WARN_ON_ONCE(1);
	}
}
EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
1851
/**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 *
 * Lockless read of the current clocksource flags, retried until a
 * consistent snapshot is observed.
 *
 * Return: non-zero if the current clocksource has CLOCK_SOURCE_VALID_FOR_HRES
 * set, 0 otherwise.
 */
int timekeeping_valid_for_hres(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	int ret;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return ret;
}
1870
/**
 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
 *
 * Return: the current clocksource's max_idle_ns, read under the timekeeper
 * seqcount so a concurrent clocksource change cannot yield a torn value.
 */
u64 timekeeping_max_deferment(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	u64 ret;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		ret = tk->tkr_mono.clock->max_idle_ns;

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return ret;
}
1889
1890 /**
1891 * read_persistent_clock64 - Return time from the persistent clock.
1892 * @ts: Pointer to the storage for the readout value
1893 *
1894 * Weak dummy function for arches that do not yet support it.
1895 * Reads the time from the battery backed persistent clock.
1896 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
1897 *
1898 * XXX - Do be sure to remove it once all arches implement it.
1899 */
read_persistent_clock64(struct timespec64 * ts)1900 void __weak read_persistent_clock64(struct timespec64 *ts)
1901 {
1902 ts->tv_sec = 0;
1903 ts->tv_nsec = 0;
1904 }
1905
/**
 * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
 * from the boot.
 * @wall_time: current time as returned by persistent clock
 * @boot_offset: offset that is defined as wall_time - boot_time
 *
 * Weak dummy function for arches that do not yet support it.
 *
 * The default function calculates offset based on the current value of
 * local_clock(). This way architectures that support sched_clock() but don't
 * support dedicated boot time clock will provide the best estimate of the
 * boot time.
 */
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
				     struct timespec64 *boot_offset)
{
	read_persistent_clock64(wall_time);
	/* Best-effort boot offset: nanoseconds since boot per local_clock() */
	*boot_offset = ns_to_timespec64(local_clock());
}
1926
tkd_basic_setup(struct tk_data * tkd,enum timekeeper_ids tk_id,bool valid)1927 static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
1928 {
1929 raw_spin_lock_init(&tkd->lock);
1930 seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
1931 tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
1932 tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
1933 }
1934
/*
 * Flag reflecting whether timekeeping_resume() has injected sleeptime.
 *
 * The flag starts off false and is only set when a suspend reaches
 * timekeeping_suspend(), timekeeping_resume() sets it to false when the
 * timekeeper clocksource is not stopping across suspend and has been
 * used to update sleep time. If the timekeeper clocksource has stopped
 * then the flag stays true and is used by the RTC resume code to decide
 * whether sleeptime must be injected and if so the flag gets false then.
 *
 * If a suspend fails before reaching timekeeping_resume() then the flag
 * stays false and prevents erroneous sleeptime injection.
 */
static bool suspend_timing_needed;

/* Flag for if there is a persistent clock on this platform */
static bool persistent_clock_exists;
1952
/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 */
void __init timekeeping_init(void)
{
	struct timespec64 wall_time, boot_offset, wall_to_mono;
	struct timekeeper *tks = &tk_core.shadow_timekeeper;
	struct clocksource *clock;

	/* Set up the core timekeeper pair and any auxiliary timekeepers */
	tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
	tk_aux_setup();

	/*
	 * A strictly positive, settable wall time proves a working
	 * persistent clock; anything non-zero but unusable is rejected.
	 */
	read_persistent_clock64(&wall_time, &boot_offset);
	if (timespec64_valid_settod(&wall_time) &&
	    timespec64_to_ns(&wall_time) > 0) {
		persistent_clock_exists = true;
	} else if (timespec64_to_ns(&wall_time) != 0) {
		pr_warn("Persistent clock returned invalid value");
		wall_time = (struct timespec64){0};
	}

	/* Boot cannot be after the persistent clock's current time */
	if (timespec64_compare(&wall_time, &boot_offset) < 0)
		boot_offset = (struct timespec64){0};

	/*
	 * We want to set wall_to_mono, so the following is true:
	 * wall time + wall_to_mono = boot time
	 */
	wall_to_mono = timespec64_sub(boot_offset, wall_time);

	guard(raw_spinlock_irqsave)(&tk_core.lock);

	ntp_init();

	clock = clocksource_default_clock();
	if (clock->enable)
		clock->enable(clock);
	tk_setup_internals(tks, clock);

	tk_set_xtime(tks, &wall_time);
	tks->raw_sec = 0;

	tk_set_wall_to_mono(tks, wall_to_mono);

	/* Publish the shadow timekeeper as the real one */
	timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
}
1999
/* time in seconds when suspend began for persistent clock */
static struct timespec64 timekeeping_suspend_time;

/**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @tk: Pointer to the timekeeper to be updated
 * @delta: Pointer to the delta value in timespec64 format
 *
 * Takes a timespec offset measuring a suspend interval and properly
 * adds the sleep offset to the timekeeping variables.
 */
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
					   const struct timespec64 *delta)
{
	/* Reject garbage deltas so a bogus readout can't corrupt time */
	if (!timespec64_valid_strict(delta)) {
		printk_deferred(KERN_WARNING
				"__timekeeping_inject_sleeptime: Invalid "
				"sleep delta value!\n");
		return;
	}
	/* Advance wall clock by the slept interval ... */
	tk_xtime_add(tk, delta);
	/* ... keep CLOCK_MONOTONIC unaffected by adjusting wall_to_mono ... */
	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
	/* ... and account the interval as boot-time (sleep) offset */
	tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
	tk_debug_account_sleep_time(delta);
}
2025
2026 #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
2027 /*
2028 * We have three kinds of time sources to use for sleep time
2029 * injection, the preference order is:
2030 * 1) non-stop clocksource
2031 * 2) persistent clock (ie: RTC accessible when irqs are off)
2032 * 3) RTC
2033 *
2034 * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
2035 * If system has neither 1) nor 2), 3) will be used finally.
2036 *
2037 *
2038 * If timekeeping has injected sleeptime via either 1) or 2),
2039 * 3) becomes needless, so in this case we don't need to call
2040 * rtc_resume(), and this is what timekeeping_rtc_skipresume()
2041 * means.
2042 */
timekeeping_rtc_skipresume(void)2043 bool timekeeping_rtc_skipresume(void)
2044 {
2045 return !suspend_timing_needed;
2046 }
2047
2048 /*
2049 * 1) can be determined whether to use or not only when doing
2050 * timekeeping_resume() which is invoked after rtc_suspend(),
2051 * so we can't skip rtc_suspend() surely if system has 1).
2052 *
2053 * But if system has 2), 2) will definitely be used, so in this
2054 * case we don't need to call rtc_suspend(), and this is what
2055 * timekeeping_rtc_skipsuspend() means.
2056 */
timekeeping_rtc_skipsuspend(void)2057 bool timekeeping_rtc_skipsuspend(void)
2058 {
2059 return persistent_clock_exists;
2060 }
2061
/**
 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
 * @delta: pointer to a timespec64 delta value
 *
 * This hook is for architectures that cannot support read_persistent_clock64
 * because their RTC/persistent clock is only accessible when irqs are enabled.
 * and also don't have an effective nonstop clocksource.
 *
 * This function should only be called by rtc_resume(), and allows
 * a suspend offset to be injected into the timekeeping values.
 */
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
	scoped_guard(raw_spinlock_irqsave, &tk_core.lock) {
		struct timekeeper *tks = &tk_core.shadow_timekeeper;

		/* RTC-based injection supersedes any pending sleeptime accounting */
		suspend_timing_needed = false;
		/* Accumulate elapsed cycles before modifying the time base */
		timekeeping_forward_now(tks);
		__timekeeping_inject_sleeptime(tks, delta);
		timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
	}

	/* Signal hrtimers about time change */
	clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
2087 #endif
2088
/**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
 */
void timekeeping_resume(void)
{
	struct timekeeper *tks = &tk_core.shadow_timekeeper;
	struct clocksource *clock = tks->tkr_mono.clock;
	struct timespec64 ts_new, ts_delta;
	bool inject_sleeptime = false;
	u64 cycle_now, nsec;
	unsigned long flags;

	read_persistent_clock64(&ts_new);

	clockevents_resume();
	clocksource_resume();

	raw_spin_lock_irqsave(&tk_core.lock, flags);

	/*
	 * After system resumes, we need to calculate the suspended time and
	 * compensate it for the OS time. There are 3 sources that could be
	 * used: Nonstop clocksource during suspend, persistent clock and rtc
	 * device.
	 *
	 * One specific platform may have 1 or 2 or all of them, and the
	 * preference will be:
	 *	suspend-nonstop clocksource -> persistent clock -> rtc
	 * The less preferred source will only be tried if there is no better
	 * usable source. The rtc part is handled separately in rtc core code.
	 */
	cycle_now = tk_clock_read(&tks->tkr_mono);
	nsec = clocksource_stop_suspend_timing(clock, cycle_now);
	if (nsec > 0) {
		/* Preferred: nonstop clocksource measured the suspend time */
		ts_delta = ns_to_timespec64(nsec);
		inject_sleeptime = true;
	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
		/* Fallback: persistent clock delta across suspend */
		ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
		inject_sleeptime = true;
	}

	if (inject_sleeptime) {
		/* Sleeptime handled here; tell RTC resume it has nothing to do */
		suspend_timing_needed = false;
		__timekeeping_inject_sleeptime(tks, &ts_delta);
	}

	/* Re-base the last cycle value */
	tks->tkr_mono.cycle_last = cycle_now;
	tks->tkr_raw.cycle_last = cycle_now;

	tks->ntp_error = 0;
	timekeeping_suspended = 0;
	timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
	raw_spin_unlock_irqrestore(&tk_core.lock, flags);

	touch_softlockup_watchdog();

	/* Resume the clockevent device(s) and hrtimers */
	tick_resume();
	/* Notify timerfd as resume is equivalent to clock_was_set() */
	timerfd_resume();
}
2151
/* Syscore resume callback: thin wrapper, @data is unused */
static void timekeeping_syscore_resume(void *data)
{
	timekeeping_resume();
}
2156
/*
 * timekeeping_suspend - Halt timekeeping and record state for resume.
 *
 * Records the persistent clock value and starts suspend timing on the
 * current clocksource so timekeeping_resume() can compute the slept time.
 * Always returns 0.
 */
int timekeeping_suspend(void)
{
	struct timekeeper *tks = &tk_core.shadow_timekeeper;
	struct timespec64 delta, delta_delta;
	static struct timespec64 old_delta;	/* persists across suspend cycles */
	struct clocksource *curr_clock;
	unsigned long flags;
	u64 cycle_now;

	read_persistent_clock64(&timekeeping_suspend_time);

	/*
	 * On some systems the persistent_clock can not be detected at
	 * timekeeping_init by its return value, so if we see a valid
	 * value returned, update the persistent_clock_exists flag.
	 */
	if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
		persistent_clock_exists = true;

	/* Cleared again by timekeeping_resume() once sleeptime is injected */
	suspend_timing_needed = true;

	raw_spin_lock_irqsave(&tk_core.lock, flags);
	timekeeping_forward_now(tks);
	timekeeping_suspended = 1;

	/*
	 * Since we've called forward_now, cycle_last stores the value
	 * just read from the current clocksource. Save this to potentially
	 * use in suspend timing.
	 */
	curr_clock = tks->tkr_mono.clock;
	cycle_now = tks->tkr_mono.cycle_last;
	clocksource_start_suspend_timing(curr_clock, cycle_now);

	if (persistent_clock_exists) {
		/*
		 * To avoid drift caused by repeated suspend/resumes,
		 * which each can add ~1 second drift error,
		 * try to compensate so the difference in system time
		 * and persistent_clock time stays close to constant.
		 */
		delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time);
		delta_delta = timespec64_sub(delta, old_delta);
		if (abs(delta_delta.tv_sec) >= 2) {
			/*
			 * if delta_delta is too large, assume time correction
			 * has occurred and set old_delta to the current delta.
			 */
			old_delta = delta;
		} else {
			/* Otherwise try to adjust old_system to compensate */
			timekeeping_suspend_time =
				timespec64_add(timekeeping_suspend_time, delta_delta);
		}
	}

	timekeeping_update_from_shadow(&tk_core, 0);
	halt_fast_timekeeper(tks);
	raw_spin_unlock_irqrestore(&tk_core.lock, flags);

	tick_suspend();
	clocksource_suspend();
	clockevents_suspend();

	return 0;
}
2223
/* Syscore suspend callback: thin wrapper, @data is unused */
static int timekeeping_syscore_suspend(void *data)
{
	return timekeeping_suspend();
}
2228
/* sysfs resume/suspend bits for timekeeping */
static const struct syscore_ops timekeeping_syscore_ops = {
	.resume = timekeeping_syscore_resume,
	.suspend = timekeeping_syscore_suspend,
};

static struct syscore timekeeping_syscore = {
	.ops = &timekeeping_syscore_ops,
};

/* Register the timekeeping suspend/resume callbacks at device initcall time */
static int __init timekeeping_init_ops(void)
{
	register_syscore(&timekeeping_syscore);
	return 0;
}
device_initcall(timekeeping_init_ops);
2245
/*
 * Apply a multiplier adjustment to the timekeeper
 *
 * @tk:       Timekeeper to adjust (the shadow copy, caller holds the lock)
 * @offset:   Not yet accumulated cycles
 * @mult_adj: Signed adjustment to apply to tkr_mono.mult
 */
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
							 s64 offset,
							 s32 mult_adj)
{
	s64 interval = tk->cycle_interval;

	/* Scale interval/offset so the math below works for any mult_adj */
	if (mult_adj == 0) {
		return;
	} else if (mult_adj == -1) {
		interval = -interval;
		offset = -offset;
	} else if (mult_adj != 1) {
		interval *= mult_adj;
		offset *= mult_adj;
	}

	/*
	 * So the following can be confusing.
	 *
	 * To keep things simple, lets assume mult_adj == 1 for now.
	 *
	 * When mult_adj != 1, remember that the interval and offset values
	 * have been appropriately scaled so the math is the same.
	 *
	 * The basic idea here is that we're increasing the multiplier
	 * by one, this causes the xtime_interval to be incremented by
	 * one cycle_interval. This is because:
	 *	xtime_interval = cycle_interval * mult
	 * So if mult is being incremented by one:
	 *	xtime_interval = cycle_interval * (mult + 1)
	 * Its the same as:
	 *	xtime_interval = (cycle_interval * mult) + cycle_interval
	 * Which can be shortened to:
	 *	xtime_interval += cycle_interval
	 *
	 * So offset stores the non-accumulated cycles. Thus the current
	 * time (in shifted nanoseconds) is:
	 *	now = (offset * adj) + xtime_nsec
	 * Now, even though we're adjusting the clock frequency, we have
	 * to keep time consistent. In other words, we can't jump back
	 * in time, and we also want to avoid jumping forward in time.
	 *
	 * So given the same offset value, we need the time to be the same
	 * both before and after the freq adjustment.
	 *	now = (offset * adj_1) + xtime_nsec_1
	 *	now = (offset * adj_2) + xtime_nsec_2
	 * So:
	 *	(offset * adj_1) + xtime_nsec_1 =
	 *		(offset * adj_2) + xtime_nsec_2
	 * And we know:
	 *	adj_2 = adj_1 + 1
	 * So:
	 *	(offset * adj_1) + xtime_nsec_1 =
	 *		(offset * (adj_1+1)) + xtime_nsec_2
	 *	(offset * adj_1) + xtime_nsec_1 =
	 *		(offset * adj_1) + offset + xtime_nsec_2
	 * Canceling the sides:
	 *	xtime_nsec_1 = offset + xtime_nsec_2
	 * Which gives us:
	 *	xtime_nsec_2 = xtime_nsec_1 - offset
	 * Which simplifies to:
	 *	xtime_nsec -= offset
	 */
	if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
		/* NTP adjustment caused clocksource mult overflow */
		WARN_ON_ONCE(1);
		return;
	}

	tk->tkr_mono.mult += mult_adj;
	tk->xtime_interval += interval;
	tk->tkr_mono.xtime_nsec -= offset;
}
2322
/*
 * Adjust the timekeeper's multiplier to the correct frequency
 * and also to reduce the accumulated error value.
 */
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
	u64 ntp_tl = ntp_tick_length(tk->id);
	u32 mult;

	/*
	 * Determine the multiplier from the current NTP tick length.
	 * Avoid expensive division when the tick length doesn't change.
	 */
	if (likely(tk->ntp_tick == ntp_tl)) {
		mult = tk->tkr_mono.mult - tk->ntp_err_mult;
	} else {
		tk->ntp_tick = ntp_tl;
		mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
				 tk->xtime_remainder, tk->cycle_interval);
	}

	/*
	 * If the clock is behind the NTP time, increase the multiplier by 1
	 * to catch up with it. If it's ahead and there was a remainder in the
	 * tick division, the clock will slow down. Otherwise it will stay
	 * ahead until the tick length changes to a non-divisible value.
	 */
	tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
	mult += tk->ntp_err_mult;

	timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);

	/* Flag grossly miscalibrated clocksources (adjustment beyond maxadj) */
	if (unlikely(tk->tkr_mono.clock->maxadj &&
		(abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
			> tk->tkr_mono.clock->maxadj))) {
		printk_once(KERN_WARNING
			"Adjusting %s more than 11%% (%ld vs %ld)\n",
			tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
			(long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
	}

	/*
	 * It may be possible that when we entered this function, xtime_nsec
	 * was very small.  Further, if we're slightly speeding the clocksource
	 * in the code above, its possible the required corrective factor to
	 * xtime_nsec could cause it to underflow.
	 *
	 * Now, since we have already accumulated the second and the NTP
	 * subsystem has been notified via second_overflow(), we need to skip
	 * the next update.
	 */
	if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
		/* Borrow a full second to bring xtime_nsec back positive */
		tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
							tk->tkr_mono.shift;
		tk->xtime_sec--;
		tk->skip_second_overflow = 1;
	}
}
2381
/*
 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
 *
 * Helper function that accumulates the nsecs greater than a second
 * from the xtime_nsec field to the xtime_secs field.
 * It also calls into the NTP code to handle leapsecond processing.
 *
 * Returns TK_CLOCK_WAS_SET when a leap second was applied, 0 otherwise.
 */
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
	/* One second in shifted (clocksource resolution) nanoseconds */
	u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
	unsigned int clock_set = 0;

	while (tk->tkr_mono.xtime_nsec >= nsecps) {
		int leap;

		tk->tkr_mono.xtime_nsec -= nsecps;
		tk->xtime_sec++;

		/*
		 * Skip NTP update if this second was accumulated before,
		 * i.e. xtime_nsec underflowed in timekeeping_adjust()
		 */
		if (unlikely(tk->skip_second_overflow)) {
			tk->skip_second_overflow = 0;
			continue;
		}

		/* Figure out if its a leap sec and apply if needed */
		leap = second_overflow(tk->id, tk->xtime_sec);
		if (unlikely(leap)) {
			struct timespec64 ts;

			tk->xtime_sec += leap;

			/* Compensate wall_to_monotonic so MONOTONIC is unaffected */
			ts.tv_sec = leap;
			ts.tv_nsec = 0;
			tk_set_wall_to_mono(tk,
				timespec64_sub(tk->wall_to_monotonic, ts));

			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);

			clock_set = TK_CLOCK_WAS_SET;
		}
	}
	return clock_set;
}
2428
/*
 * logarithmic_accumulation - shifted accumulation of cycles
 *
 * @tk:        Timekeeper being advanced
 * @offset:    Cycles not yet accumulated
 * @shift:     log2 of the number of cycle_intervals to accumulate at once
 * @clock_set: Out parameter, ORed with TK_CLOCK_WAS_SET on leap second
 *
 * This functions accumulates a shifted interval of cycles into
 * a shifted interval nanoseconds. Allows for O(log) accumulation
 * loop.
 *
 * Returns the unconsumed cycles.
 */
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
				    u32 shift, unsigned int *clock_set)
{
	u64 interval = tk->cycle_interval << shift;
	u64 snsec_per_sec;

	/* If the offset is smaller than a shifted interval, do nothing */
	if (offset < interval)
		return offset;

	/* Accumulate one shifted interval */
	offset -= interval;
	tk->tkr_mono.cycle_last += interval;
	tk->tkr_raw.cycle_last += interval;

	tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
	*clock_set |= accumulate_nsecs_to_secs(tk);

	/* Accumulate raw time */
	tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
	snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
	while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
		tk->tkr_raw.xtime_nsec -= snsec_per_sec;
		tk->raw_sec++;
	}

	/* Accumulate error between NTP and clock interval */
	tk->ntp_error += tk->ntp_tick << shift;
	tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
						(tk->ntp_error_shift + shift);

	return offset;
}
2471
/*
 * timekeeping_advance - Updates the timekeeper to the current time and
 * current NTP tick length
 *
 * Returns true if a clock-was-set event must be propagated. Caller must
 * hold tkd->lock.
 */
static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
	struct timekeeper *tk = &tkd->shadow_timekeeper;
	struct timekeeper *real_tk = &tkd->timekeeper;
	unsigned int clock_set = 0;
	int shift = 0, maxshift;
	u64 offset, orig_offset;

	/* Make sure we're fully resumed: */
	if (unlikely(timekeeping_suspended))
		return false;

	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
				   tk->tkr_mono.clock->max_raw_delta);
	orig_offset = offset;
	/* Check if there's really nothing to do */
	if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
		return false;

	/*
	 * With NO_HZ we may have to accumulate many cycle_intervals
	 * (think "ticks") worth of time at once. To do this efficiently,
	 * we calculate the largest doubling multiple of cycle_intervals
	 * that is smaller than the offset.  We then accumulate that
	 * chunk in one go, and then try to consume the next smaller
	 * doubled multiple.
	 */
	shift = ilog2(offset) - ilog2(tk->cycle_interval);
	shift = max(0, shift);
	/* Bound shift to one less than what overflows tick_length */
	maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
	shift = min(shift, maxshift);
	while (offset >= tk->cycle_interval) {
		offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
		if (offset < tk->cycle_interval<<shift)
			shift--;
	}

	/* Adjust the multiplier to correct NTP error */
	timekeeping_adjust(tk, offset);

	/*
	 * Finally, make sure that after the rounding
	 * xtime_nsec isn't larger than NSEC_PER_SEC
	 */
	clock_set |= accumulate_nsecs_to_secs(tk);

	/*
	 * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls
	 * making small negative adjustments to the base xtime_nsec
	 * value, only update the coarse clocks if we accumulated time
	 */
	if (orig_offset != offset)
		tk_update_coarse_nsecs(tk);

	timekeeping_update_from_shadow(tkd, clock_set);

	return !!clock_set;
}
2536
/* Locked wrapper around __timekeeping_advance() for the core timekeeper */
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
	guard(raw_spinlock_irqsave)(&tk_core.lock);
	return __timekeeping_advance(&tk_core, mode);
}
2542
2543 /**
2544 * update_wall_time - Uses the current clocksource to increment the wall time
2545 *
2546 * It also updates the enabled auxiliary clock timekeepers
2547 */
update_wall_time(void)2548 void update_wall_time(void)
2549 {
2550 if (timekeeping_advance(TK_ADV_TICK))
2551 clock_was_set_delayed();
2552 tk_aux_advance();
2553 }
2554
2555 /**
2556 * getboottime64 - Return the real time of system boot.
2557 * @ts: pointer to the timespec64 to be set
2558 *
2559 * Returns the wall-time of boot in a timespec64.
2560 *
2561 * This is based on the wall_to_monotonic offset and the total suspend
2562 * time. Calls to settimeofday will affect the value returned (which
2563 * basically means that however wrong your real time clock is at boot time,
2564 * you get the right time here).
2565 */
getboottime64(struct timespec64 * ts)2566 void getboottime64(struct timespec64 *ts)
2567 {
2568 struct timekeeper *tk = &tk_core.timekeeper;
2569 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
2570
2571 *ts = ktime_to_timespec64(t);
2572 }
2573 EXPORT_SYMBOL_GPL(getboottime64);
2574
/*
 * ktime_get_coarse_real_ts64 - Fill @ts with the coarse (tick granular)
 * CLOCK_REALTIME value, read under the timekeeper seqcount.
 */
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		*ts = tk_xtime_coarse(tk);
	} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
2587
/**
 * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
 * @ts: timespec64 to be filled
 *
 * Fetch the global mg_floor value, convert it to realtime and compare it
 * to the current coarse-grained time. Fill @ts with whichever is
 * latest. Note that this is a filesystem-specific interface and should be
 * avoided outside of that context.
 */
void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	u64 floor = atomic64_read(&mg_floor);
	ktime_t f_real, offset, coarse;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		*ts = tk_xtime_coarse(tk);
		offset = tk_core.timekeeper.offs_real;
	} while (read_seqcount_retry(&tk_core.seq, seq));

	/* mg_floor is monotonic based; convert it to realtime via offs_real */
	coarse = timespec64_to_ktime(*ts);
	f_real = ktime_add(floor, offset);
	if (ktime_after(f_real, coarse))
		*ts = ktime_to_timespec64(f_real);
}
2615
/**
 * ktime_get_real_ts64_mg - attempt to update floor value and return result
 * @ts:      pointer to the timespec to be set
 *
 * Get a monotonic fine-grained time value and attempt to swap it into
 * mg_floor. If that succeeds then accept the new floor value. If it fails
 * then another task raced in during the interim time and updated the
 * floor.  Since any update to the floor must be later than the previous
 * floor, either outcome is acceptable.
 *
 * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
 * and determining that the resulting coarse-grained timestamp did not effect
 * a change in ctime. Any more recent floor value would effect a change to
 * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
 *
 * @ts will be filled with the latest floor value, regardless of the outcome of
 * the cmpxchg. Note that this is a filesystem specific interface and should be
 * avoided outside of that context.
 */
void ktime_get_real_ts64_mg(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	ktime_t old = atomic64_read(&mg_floor);
	ktime_t offset, mono;
	unsigned int seq;
	u64 nsecs;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		ts->tv_sec = tk->xtime_sec;
		mono = tk->tkr_mono.base;
		nsecs = timekeeping_get_ns(&tk->tkr_mono);
		offset = tk_core.timekeeper.offs_real;
	} while (read_seqcount_retry(&tk_core.seq, seq));

	/* Fine-grained monotonic timestamp used as the new floor candidate */
	mono = ktime_add_ns(mono, nsecs);

	/*
	 * Attempt to update the floor with the new time value. As any
	 * update must be later then the existing floor, and would effect
	 * a change to ctime from the perspective of the current task,
	 * accept the resulting floor value regardless of the outcome of
	 * the swap.
	 */
	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
		ts->tv_nsec = 0;
		timespec64_add_ns(ts, nsecs);
		timekeeping_inc_mg_floor_swaps();
	} else {
		/*
		 * Another task changed mg_floor since "old" was fetched.
		 * "old" has been updated with the latest value of "mg_floor".
		 * That value is newer than the previous floor value, which
		 * is enough to effect a change to ctime. Accept it.
		 */
		*ts = ktime_to_timespec64(ktime_add(old, offset));
	}
}
2675
/*
 * ktime_get_coarse_ts64 - Fill @ts with the coarse (tick granular)
 * CLOCK_MONOTONIC value: coarse xtime plus the wall_to_monotonic offset.
 */
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	struct timespec64 now, mono;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		now = tk_xtime_coarse(tk);
		mono = tk->wall_to_monotonic;
	} while (read_seqcount_retry(&tk_core.seq, seq));

	/* Sum may exceed a second in tv_nsec; normalize the result */
	set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
				now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
2693
/*
 * Must hold jiffies_lock
 */
void do_timer(unsigned long ticks)
{
	/* Advance the global jiffies counter and refresh the load average */
	jiffies_64 += ticks;
	calc_global_load();
}
2702
/**
 * ktime_get_update_offsets_now - hrtimer helper
 * @cwsseq:	pointer to check and store the clock was set sequence number
 * @offs_real:	pointer to storage for monotonic -> realtime offset
 * @offs_boot:	pointer to storage for monotonic -> boottime offset
 * @offs_tai:	pointer to storage for monotonic -> clock tai offset
 *
 * Returns current monotonic time and updates the offsets if the
 * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
 * different.
 *
 * Called from hrtimer_interrupt() or retrigger_next_event()
 */
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
				     ktime_t *offs_boot, ktime_t *offs_tai)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	ktime_t base;
	u64 nsecs;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		base = tk->tkr_mono.base;
		nsecs = timekeeping_get_ns(&tk->tkr_mono);
		base = ktime_add_ns(base, nsecs);

		/* Only refresh the offsets when the clock was set since last call */
		if (*cwsseq != tk->clock_was_set_seq) {
			*cwsseq = tk->clock_was_set_seq;
			*offs_real = tk->offs_real;
			*offs_boot = tk->offs_boot;
			*offs_tai = tk->offs_tai;
		}

		/* Handle leapsecond insertion adjustments */
		if (unlikely(base >= tk->next_leap_ktime))
			*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return base;
}
2746
2747 /*
2748 * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
2749 */
/*
 * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
 *
 * Checks permissions and value ranges of @txc before any state is touched.
 * @aux_clock restricts the allowed modes further for auxiliary clocks.
 */
static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
{
	if (txc->modes & ADJ_ADJTIME) {
		/* ADJ_OFFSET_SINGLESHOT is mandatory in adjtime mode */
		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
			return -EINVAL;
		if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME))
			return -EPERM;
	} else {
		/* Any modification requires CAP_SYS_TIME */
		if (txc->modes && !capable(CAP_SYS_TIME))
			return -EPERM;
		/*
		 * A tick length deviating more than 10% from nominal
		 * means the quartz is severely broken. Reject it.
		 */
		if ((txc->modes & ADJ_TICK) &&
		    (txc->tick < 900000 / USER_HZ || txc->tick > 1100000 / USER_HZ))
			return -EINVAL;
	}

	if (txc->modes & ADJ_SETOFFSET) {
		/* Injecting a time offset requires CAP_SYS_TIME as well */
		if (!capable(CAP_SYS_TIME))
			return -EPERM;

		/*
		 * The injected offset is the sum of the fields. tv_sec may
		 * be negative, but tv_usec/tv_nsec must be non-negative
		 * and below one second in the selected resolution.
		 */
		if (txc->time.tv_usec < 0)
			return -EINVAL;
		if (txc->time.tv_usec >=
		    ((txc->modes & ADJ_NANO) ? NSEC_PER_SEC : USEC_PER_SEC))
			return -EINVAL;
	}

	/*
	 * The frequency value gets multiplied by PPM_SCALE later on,
	 * which can overflow on 64-bit systems. Range check it upfront.
	 */
	if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
		if (txc->freq < LLONG_MIN / PPM_SCALE ||
		    txc->freq > LLONG_MAX / PPM_SCALE)
			return -EINVAL;
	}

	if (aux_clock) {
		/* Auxiliary clocks are TAI-like and have no leap seconds */
		if ((txc->modes & ADJ_STATUS) &&
		    (txc->status & (STA_INS | STA_DEL)))
			return -EINVAL;

		/* The TAI offset cannot be set on an auxiliary clock */
		if (txc->modes & ADJ_TAI)
			return -EINVAL;

		/* PPS is not supported on auxiliary clocks either */
		if ((txc->modes & ADJ_STATUS) &&
		    (txc->status & (STA_PPSFREQ | STA_PPSTIME)))
			return -EINVAL;
	}

	return 0;
}
2827
2828 /**
2829 * random_get_entropy_fallback - Returns the raw clock source value,
2830 * used by random.c for platforms with no valid random_get_entropy().
2831 */
random_get_entropy_fallback(void)2832 unsigned long random_get_entropy_fallback(void)
2833 {
2834 struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
2835 struct clocksource *clock = READ_ONCE(tkr->clock);
2836
2837 if (unlikely(timekeeping_suspended || !clock))
2838 return 0;
2839 return clock->read(clock);
2840 }
2841 EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
2842
/*
 * Collects the outcome of __do_adjtimex() for post processing (audit
 * logging, clock_was_set() notification) after the timekeeper lock has
 * been dropped. Callers zero-initialize it.
 */
struct adjtimex_result {
	/* NTP state changes, recorded for audit_ntp_log() */
	struct audit_ntp_data ad;
	/* Offset injected via ADJ_SETOFFSET; only written when that mode was set */
	struct timespec64 delta;
	/* True when the clock was modified and clock_was_set() is due */
	bool clock_set;
};
2848
/*
 * __do_adjtimex() - Adjust the timekeeper denoted by @tkd according to @txc
 *
 * Validates @txc, then applies offset injection, NTP parameter changes and
 * TAI offset updates under tkd::lock. Data which needs post processing
 * outside the lock (audit, clock_was_set) is stored in @result.
 */
static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
			 struct adjtimex_result *result)
{
	struct timekeeper *tks = &tkd->shadow_timekeeper;
	bool aux_clock = !timekeeper_is_core_tk(tks);
	struct timespec64 ts;
	s32 orig_tai, tai;
	int ret;

	/* Validate the data before disabling interrupts */
	ret = timekeeping_validate_timex(txc, aux_clock);
	if (ret)
		return ret;
	/* Feed the (unpredictable) user supplied data to the entropy pool */
	add_device_randomness(txc, sizeof(*txc));

	/* Read the current time of the clock which is being adjusted */
	if (!aux_clock)
		ktime_get_real_ts64(&ts);
	else
		tk_get_aux_ts64(tkd->timekeeper.id, &ts);

	add_device_randomness(&ts, sizeof(ts));

	guard(raw_spinlock_irqsave)(&tkd->lock);

	/* Auxiliary clocks can be disabled concurrently */
	if (!tks->clock_valid)
		return -ENODEV;

	if (txc->modes & ADJ_SETOFFSET) {
		result->delta.tv_sec = txc->time.tv_sec;
		result->delta.tv_nsec = txc->time.tv_usec;
		/* Without ADJ_NANO the tv_usec field carries microseconds */
		if (!(txc->modes & ADJ_NANO))
			result->delta.tv_nsec *= 1000;
		ret = __timekeeping_inject_offset(tkd, &result->delta);
		if (ret)
			return ret;
		result->clock_set = true;
	}

	orig_tai = tai = tks->tai_offset;
	ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);

	if (tai != orig_tai) {
		/* TAI offset changed: propagate it and flag the clock as set */
		__timekeeping_set_tai_offset(tks, tai);
		timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
		result->clock_set = true;
	} else {
		tk_update_leap_state_all(tkd);
	}

	/* Update the multiplier immediately if frequency was set directly */
	if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
		result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);

	return ret;
}
2904
2905 /**
2906 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
2907 * @txc: Pointer to kernel_timex structure containing NTP parameters
2908 */
int do_adjtimex(struct __kernel_timex *txc)
{
	struct adjtimex_result result = { };
	int ret;

	ret = __do_adjtimex(&tk_core, txc, &result);
	if (ret < 0)
		return ret;

	/* Audit logging must happen outside of the timekeeper lock */
	if (txc->modes & ADJ_SETOFFSET)
		audit_tk_injoffset(result.delta);

	audit_ntp_log(&result.ad);

	/* Tell hrtimers and friends that the clock was modified */
	if (result.clock_set)
		clock_was_set(CLOCK_SET_WALL);

	/* Let the CMOS/RTC sync machinery know whether seconds were injected */
	ntp_notify_cmos_timer(result.delta.tv_sec != 0);

	return ret;
}
2930
2931 /*
2932 * Invoked from NTP with the time keeper lock held, so lockless access is
2933 * fine.
2934 */
long ktime_get_ntp_seconds(unsigned int id)
{
	/* CLOCK_REALTIME seconds of the timekeeper with the given @id */
	return timekeeper_data[id].timekeeper.xtime_sec;
}
2939
2940 #ifdef CONFIG_NTP_PPS
2941 /**
2942 * hardpps() - Accessor function to NTP __hardpps function
2943 * @phase_ts: Pointer to timespec64 structure representing phase timestamp
2944 * @raw_ts: Pointer to timespec64 structure representing raw timestamp
2945 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
	/* Serialize against concurrent core timekeeper updates */
	guard(raw_spinlock_irqsave)(&tk_core.lock);
	__hardpps(phase_ts, raw_ts);
}
EXPORT_SYMBOL(hardpps);
2952 #endif /* CONFIG_NTP_PPS */
2953
2954 #ifdef CONFIG_POSIX_AUX_CLOCKS
2955 #include "posix-timers.h"
2956
2957 /*
2958 * Bitmap for the activated auxiliary timekeepers to allow lockless quick
2959 * checks in the hot paths without touching extra cache lines. If set, then
2960 * the state of the corresponding timekeeper has to be re-checked under
2961 * timekeeper::lock.
2962 */
2963 static unsigned long aux_timekeepers;
2964
clockid_to_tkid(unsigned int id)2965 static inline unsigned int clockid_to_tkid(unsigned int id)
2966 {
2967 return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
2968 }
2969
/* Look up the tk_data of an auxiliary clock; NULL for invalid clock ids */
static inline struct tk_data *aux_get_tk_data(clockid_t id)
{
	if (clockid_aux_valid(id))
		return &timekeeper_data[clockid_to_tkid(id)];

	return NULL;
}
2976
2977 /* Invoked from timekeeping after a clocksource change */
static void tk_aux_update_clocksource(void)
{
	/* Lockless snapshot; the real state is re-checked under tkd::lock */
	unsigned long active = READ_ONCE(aux_timekeepers);
	unsigned int id;

	for_each_set_bit(id, &active, BITS_PER_LONG) {
		struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
		struct timekeeper *tks = &tkd->shadow_timekeeper;

		guard(raw_spinlock_irqsave)(&tkd->lock);
		/* The clock might have been disabled after the snapshot */
		if (!tks->clock_valid)
			continue;

		/* Accumulate elapsed time before switching the clocksource */
		timekeeping_forward_now(tks);
		/* Auxiliary clocks run on the raw clock of the core timekeeper */
		tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock);
		timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
	}
}
2996
static void tk_aux_advance(void)
{
	/* Lockless quick check to avoid touching extra cache lines */
	unsigned long active = READ_ONCE(aux_timekeepers);
	unsigned int id;

	for_each_set_bit(id, &active, BITS_PER_LONG) {
		struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];

		/* Re-check validity under the lock before advancing */
		guard(raw_spinlock)(&aux_tkd->lock);
		if (aux_tkd->shadow_timekeeper.clock_valid)
			__timekeeping_advance(aux_tkd, TK_ADV_TICK);
	}
}
3011
3012 /**
3013 * ktime_get_aux - Get time for a AUX clock
3014 * @id: ID of the clock to read (CLOCK_AUX...)
3015 * @kt: Pointer to ktime_t to store the time stamp
3016 *
3017 * Returns: True if the timestamp is valid, false otherwise
3018 */
bool ktime_get_aux(clockid_t id, ktime_t *kt)
{
	struct tk_data *aux_tkd = aux_get_tk_data(id);
	struct timekeeper *aux_tk;
	unsigned int seq;
	ktime_t base;
	u64 nsecs;

	WARN_ON(timekeeping_suspended);

	/* aux_get_tk_data() returns NULL for ids outside the AUX range */
	if (!aux_tkd)
		return false;

	aux_tk = &aux_tkd->timekeeper;
	do {
		seq = read_seqcount_begin(&aux_tkd->seq);
		/* A disabled clock is reported as invalid, not as time zero */
		if (!aux_tk->clock_valid)
			return false;

		/* AUX time is the monotonic base plus the per-clock offset */
		base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
		nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
	} while (read_seqcount_retry(&aux_tkd->seq, seq));

	*kt = ktime_add_ns(base, nsecs);
	return true;
}
EXPORT_SYMBOL_GPL(ktime_get_aux);
3046
3047 /**
3048 * ktime_get_aux_ts64 - Get time for a AUX clock
3049 * @id: ID of the clock to read (CLOCK_AUX...)
3050 * @ts: Pointer to timespec64 to store the time stamp
3051 *
3052 * Returns: True if the timestamp is valid, false otherwise
3053 */
ktime_get_aux_ts64(clockid_t id,struct timespec64 * ts)3054 bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
3055 {
3056 ktime_t now;
3057
3058 if (!ktime_get_aux(id, &now))
3059 return false;
3060 *ts = ktime_to_timespec64(now);
3061 return true;
3062 }
3063 EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
3064
/*
 * clock_getres() callback for auxiliary clocks.
 *
 * Returns -ENODEV for invalid clock ids, otherwise fills @tp with the
 * auxiliary clock resolution. The resolution is read once and split into
 * seconds/nanoseconds instead of invoking aux_clock_resolution_ns() twice.
 */
static int aux_get_res(clockid_t id, struct timespec64 *tp)
{
	u64 res;

	if (!clockid_aux_valid(id))
		return -ENODEV;

	res = aux_clock_resolution_ns();
	tp->tv_sec = res / NSEC_PER_SEC;
	tp->tv_nsec = res % NSEC_PER_SEC;
	return 0;
}
3074
/* clock_gettime() callback for auxiliary clocks */
static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
{
	if (!ktime_get_aux_ts64(id, tp))
		return -ENODEV;

	return 0;
}
3079
static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
{
	struct tk_data *aux_tkd = aux_get_tk_data(id);
	struct timekeeper *aux_tks;
	ktime_t tnow, nsecs;

	if (!timespec64_valid_settod(tnew))
		return -EINVAL;
	/* NULL for clock ids outside the AUX range */
	if (!aux_tkd)
		return -ENODEV;

	aux_tks = &aux_tkd->shadow_timekeeper;

	guard(raw_spinlock_irq)(&aux_tkd->lock);
	/* The clock can be disabled concurrently via sysfs */
	if (!aux_tks->clock_valid)
		return -ENODEV;

	/* Forward the timekeeper base time */
	timekeeping_forward_now(aux_tks);
	/*
	 * Get the updated base time. tkr_mono.base has not been
	 * updated yet, so do that first. That makes the update
	 * in timekeeping_update_from_shadow() redundant, but
	 * that's harmless. After that @tnow can be calculated
	 * by using tkr_mono::cycle_last, which has been set
	 * by timekeeping_forward_now().
	 */
	tk_update_ktime_data(aux_tks);
	nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
	tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);

	/*
	 * Calculate the new AUX offset as delta to @tnow ("monotonic").
	 * That avoids all the tk::xtime back and forth conversions as
	 * xtime ("realtime") is not applicable for auxiliary clocks and
	 * kept in sync with "monotonic".
	 */
	tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));

	/* Publish the shadow state to the readout timekeeper */
	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
	return 0;
}
3122
aux_clock_adj(const clockid_t id,struct __kernel_timex * txc)3123 static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
3124 {
3125 struct tk_data *aux_tkd = aux_get_tk_data(id);
3126 struct adjtimex_result result = { };
3127
3128 if (!aux_tkd)
3129 return -ENODEV;
3130
3131 /*
3132 * @result is ignored for now as there are neither hrtimers nor a
3133 * RTC related to auxiliary clocks for now.
3134 */
3135 return __do_adjtimex(aux_tkd, txc, &result);
3136 }
3137
/* POSIX clock operations backing the CLOCK_AUX clock ids */
const struct k_clock clock_aux = {
	.clock_getres = aux_get_res,
	.clock_get_timespec = aux_get_timespec,
	.clock_set = aux_clock_set,
	.clock_adj = aux_clock_adj,
};
3144
static void aux_clock_enable(clockid_t id)
{
	struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
	struct tk_data *aux_tkd = aux_get_tk_data(id);
	struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;

	/* Prevent the core timekeeper from changing. */
	guard(raw_spinlock_irq)(&tk_core.lock);

	/*
	 * Setup the auxiliary clock assuming that the raw core timekeeper
	 * clock frequency conversion is close enough. Userspace has to
	 * adjust for the deviation via clock_adjtime(2).
	 *
	 * Nested lock order: tk_core.lock -> aux_tkd->lock.
	 */
	guard(raw_spinlock_nested)(&aux_tkd->lock);

	/* Remove leftovers of a previous registration */
	memset(aux_tks, 0, sizeof(*aux_tks));
	/* Restore the timekeeper id, which the memset() wiped out */
	aux_tks->id = aux_tkd->timekeeper.id;
	/* Setup the timekeeper based on the current system clocksource */
	tk_setup_internals(aux_tks, tkr_raw->clock);

	/* Mark it valid and set it live */
	aux_tks->clock_valid = true;
	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
}
3172
static void aux_clock_disable(clockid_t id)
{
	struct tk_data *aux_tkd = aux_get_tk_data(id);

	/*
	 * NOTE(review): aux_get_tk_data() returns NULL for invalid ids and
	 * the result is dereferenced unchecked. The only caller visible
	 * here (the sysfs store) passes ids derived from the "0..7"
	 * directory names — confirm validity before adding call sites.
	 */
	guard(raw_spinlock_irq)(&aux_tkd->lock);
	aux_tkd->shadow_timekeeper.clock_valid = false;
	/* Propagate the invalidation to the readout timekeeper */
	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
}
3181
3182 static DEFINE_MUTEX(aux_clock_mutex);
3183
aux_clock_enable_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3184 static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
3185 const char *buf, size_t count)
3186 {
3187 /* Lazy atoi() as name is "0..7" */
3188 int id = kobj->name[0] & 0x7;
3189 bool enable;
3190
3191 if (!capable(CAP_SYS_TIME))
3192 return -EPERM;
3193
3194 if (kstrtobool(buf, &enable) < 0)
3195 return -EINVAL;
3196
3197 guard(mutex)(&aux_clock_mutex);
3198 if (enable == test_bit(id, &aux_timekeepers))
3199 return count;
3200
3201 if (enable) {
3202 aux_clock_enable(CLOCK_AUX + id);
3203 set_bit(id, &aux_timekeepers);
3204 } else {
3205 aux_clock_disable(CLOCK_AUX + id);
3206 clear_bit(id, &aux_timekeepers);
3207 }
3208 return count;
3209 }
3210
aux_clock_enable_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3211 static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
3212 {
3213 unsigned long active = READ_ONCE(aux_timekeepers);
3214 /* Lazy atoi() as name is "0..7" */
3215 int id = kobj->name[0] & 0x7;
3216
3217 return sysfs_emit(buf, "%d\n", test_bit(id, &active));
3218 }
3219
/* Read/write "aux_clock_enable" attribute, one per auxiliary clock directory */
static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);

static struct attribute *aux_clock_enable_attrs[] = {
	&aux_clock_enable_attr.attr,
	NULL
};

static const struct attribute_group aux_clock_enable_attr_group = {
	.attrs = aux_clock_enable_attrs,
};
3230
/* Create /sys/kernel/time/aux_clocks/[0-7]/aux_clock_enable */
static int __init tk_aux_sysfs_init(void)
{
	struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
	int ret = -ENOMEM;

	if (!tko)
		return ret;

	auxo = kobject_create_and_add("aux_clocks", tko);
	if (!auxo)
		goto err_clean;

	for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
		/* Directory name is the clock index "0".."7" */
		char id[2] = { [0] = '0' + i, };
		struct kobject *clk = kobject_create_and_add(id, auxo);

		if (!clk) {
			ret = -ENOMEM;
			goto err_clean;
		}

		ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
		if (ret)
			goto err_clean;
	}
	return 0;

err_clean:
	/*
	 * NOTE(review): per-clock kobjects created in earlier loop
	 * iterations are not put here. As this only runs once at
	 * late_initcall time a failure leaves a small one-off leak —
	 * confirm this is acceptable if the code ever becomes modular.
	 */
	kobject_put(auxo);
	kobject_put(tko);
	return ret;
}
late_initcall(tk_aux_sysfs_init);
3264
tk_aux_setup(void)3265 static __init void tk_aux_setup(void)
3266 {
3267 for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
3268 tkd_basic_setup(&timekeeper_data[i], i, false);
3269 }
3270 #endif /* CONFIG_POSIX_AUX_CLOCKS */
3271